{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "http://einext02:4040\n"
     ]
    }
   ],
   "source": [
    "import sys, glob, os\n",
    "SPARK_HOME = \"/wd/software/spark-2.3.1-bin-hadoop2.7\"\n",
    "#SPARK_HOME=os.environ['SPARK_HOME']\n",
    "sys.path.append(SPARK_HOME + \"/python\")\n",
    "sys.path.append(glob.glob(SPARK_HOME + \"/python/lib/py4j*.zip\")[0])\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.conf import SparkConf\n",
    "from pyspark.storagelevel import StorageLevel\n",
    "\n",
    "\n",
    "spark = (SparkSession\n",
    "         .builder\n",
    "         .config(\"spark.master\", \"local[*]\")\n",
    "         .config(\"spark.driver.memory\", \"56G\")\n",
    "         .config(\"spark.sql.shuffle.partitions\", 32)\n",
    "         .config(\"spark.local.dir\", \"/spark-scratch\")\n",
    "         .config(\"spark.executor.extraJavaOptions\", \"-XX:+UseG1GC\")\n",
    "         .enableHiveSupport()\n",
    "         .getOrCreate())\n",
    "\n",
    "sc = spark.sparkContext\n",
    "\n",
    "print(sc.uiWebUrl)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ERD\n",
    "https://drive.google.com/open?id=1dHAdBT84rEDf3WiE7FSfFrpnGQVL0wUSsWAGEH7ywZg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import pyspark.sql.functions as F\n",
    "from pyspark.sql.window import Window\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import metrics\n",
    "\n",
    "\n",
    "pd.options.display.max_columns = 1000\n",
    "pd.options.display.max_rows = 10\n",
    "\n",
    "fast_mode = True\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "\n",
    "from IPython.core.magic import register_line_magic\n",
    "\n",
    "\n",
    "@register_line_magic\n",
    "def show(line, n = 5):\n",
    "    return eval(line).limit(n).toPandas()\n",
    "\n",
    "@register_line_magic\n",
    "def sql(line, n = 10):\n",
    "    return spark.sql(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_path = \"/data/kaggle/outbrain_ctr/parquet/\"\n",
    "\n",
    "def cache_df(df, name, sorage_level = StorageLevel.MEMORY_ONLY):\n",
    "    df.createOrReplaceTempView(name)\n",
    "    spark.catalog.cacheTable(name)\n",
    "\n",
    "def load(name, rebase_timestamp = False, cache = True):\n",
    "    df = spark.read.load(base_path + name)\n",
    "        \n",
    "    if rebase_timestamp and \"timestamp\" in df.columns:\n",
    "        df = df.withColumn(\"timestamp\"\n",
    "            , F.expr(\"cast(from_unixtime(cast((timestamp + 1465876799998)/1000 as int)) as timestamp)\"))\n",
    "    if cache:\n",
    "        cache_df(df, name)\n",
    "\n",
    "    df.alias(name)\n",
    "    print(\"Number of partitions for df %s: %d\" % (name, df.rdd.getNumPartitions()))\n",
    "        \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "clicks_test\r\n",
      "clicks_train\r\n",
      "df_null_removed\r\n",
      "df_null_removed2\r\n",
      "documents_categories\r\n",
      "documents_entities\r\n",
      "documents_meta\r\n",
      "documents_topics\r\n",
      "events\r\n",
      "lrModel\r\n",
      "lrModel2\r\n",
      "merged_enriched\r\n",
      "page_views\r\n",
      "page_views_sample\r\n",
      "promoted_content\r\n",
      "sample_submission\r\n"
     ]
    }
   ],
   "source": [
    "!ls -1 /data/kaggle/outbrain_ctr/parquet/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clicks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df clicks_train: 11\n",
      "+----------+------+-------+\n",
      "|display_id| ad_id|clicked|\n",
      "+----------+------+-------+\n",
      "|         1| 42337|      0|\n",
      "|         1|139684|      0|\n",
      "|         1|144739|      1|\n",
      "|         1|156824|      0|\n",
      "|         1|279295|      0|\n",
      "|         1|296965|      0|\n",
      "|         2|125211|      0|\n",
      "|         2|156535|      0|\n",
      "|         2|169564|      0|\n",
      "|         2|308455|      1|\n",
      "|         3| 71547|      0|\n",
      "|         3| 95814|      0|\n",
      "|         3|152141|      0|\n",
      "|         3|183846|      0|\n",
      "|         3|228657|      1|\n",
      "|         3|250082|      0|\n",
      "|         4|149930|      0|\n",
      "|         4|153623|      1|\n",
      "|         4|184709|      0|\n",
      "|         4|186849|      0|\n",
      "+----------+------+-------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "clicks_train = load(\"clicks_train\")\n",
    "clicks_train.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df clicks_test: 8\n",
      "+----------+------+\n",
      "|display_id| ad_id|\n",
      "+----------+------+\n",
      "|  17662676|141437|\n",
      "|  17662676|169868|\n",
      "|  17662676|236063|\n",
      "|  17662676|414970|\n",
      "|  17662677|114004|\n",
      "|  17662677|115935|\n",
      "|  17662677|133772|\n",
      "|  17662677|156669|\n",
      "|  17662678|123742|\n",
      "|  17662678|157185|\n",
      "|  17662678|169432|\n",
      "|  17662678|366611|\n",
      "|  17662679|111055|\n",
      "|  17662679|130942|\n",
      "|  17662679|160452|\n",
      "|  17662679|190318|\n",
      "|  17662679|207568|\n",
      "|  17662679|233512|\n",
      "|  17662680|141437|\n",
      "|  17662680|163614|\n",
      "+----------+------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "clicks_test = load(\"clicks_test\")\n",
    "clicks_test.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(87141731, 32225162)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clicks_train.count(), clicks_test.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Distinct count of ad_id in training and test dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 4 ms, total: 4 ms\n",
      "Wall time: 2.61 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(478950, 381385)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time clicks_train.select(\"ad_id\").distinct().count(), clicks_test.select(\"ad_id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Common ad_id in training and test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 11.9 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "316035"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time clicks_train.select(\"ad_id\").intersect(clicks_test.select(\"ad_id\")).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.17134916160834857"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1- 316035/381385"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "17% of ad_id in testing dataset are unique."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate CTR on training dataset. Note, we cannot calculate the CTR on test dataset since clicked column is provded the value. In fact, the rask is to predict the probability of click."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+--------------------+----------+\n",
      "| ad_id|                 ctr|view_count|\n",
      "+------+--------------------+----------+\n",
      "| 71547| 0.08026711185308848|     14975|\n",
      "|152141|  0.1633811230585424|     20088|\n",
      "| 35982|  0.3194444444444444|       144|\n",
      "|220315|  0.0814290895901342|     11028|\n",
      "| 28347| 0.21975410210704327|     21879|\n",
      "| 55988| 0.08457994066780282|     15843|\n",
      "| 51445|  0.2615935392556823|     12011|\n",
      "|198151| 0.14902102973168962|     71708|\n",
      "| 47081| 0.46573604060913704|       788|\n",
      "| 86281| 0.16210065645514224|     11425|\n",
      "|  8748|  0.2739398855391825|     22191|\n",
      "|271623| 0.06908981676179032|      3329|\n",
      "|102724|  0.5151337359792925|     28975|\n",
      "| 73754| 0.04944791166586654|      2083|\n",
      "|232276|0.029014177382129903|      3033|\n",
      "|107571|  0.2857142857142857|        42|\n",
      "|167350| 0.16086459524056948|      9623|\n",
      "|279899|  0.3333333333333333|         3|\n",
      "|159405|  0.3200608122782886|     13813|\n",
      "| 14400|  0.4529262086513995|      1965|\n",
      "+------+--------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ctrs = clicks_train.groupBy(\"ad_id\")\\\n",
    ".agg(F.expr(\"sum(clicked)/count(*)\").alias(\"ctr\"), F.count(\"*\").alias(\"view_count\"))\n",
    "\n",
    "ctrs.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-------------------+\n",
      "|summary|                ctr|\n",
      "+-------+-------------------+\n",
      "|  count|             478950|\n",
      "|   mean| 0.1432241011540594|\n",
      "| stddev|0.21768444463682177|\n",
      "|    min|                0.0|\n",
      "|    max|                1.0|\n",
      "+-------+-------------------+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 1.81 s\n"
     ]
    }
   ],
   "source": [
    "%time ctrs.select(\"ctr\").describe().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Median CTR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------------------------------+\n",
      "|percentile(ctr, CAST(0.5 AS DOUBLE), 1)|\n",
      "+---------------------------------------+\n",
      "|                    0.05172413793103448|\n",
      "+---------------------------------------+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 1.9 s\n"
     ]
    }
   ],
   "source": [
    "%time ctrs.selectExpr(\"percentile(ctr, 0.5)\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+------------------+----------+\n",
      "| ad_id|               ctr|view_count|\n",
      "+------+------------------+----------+\n",
      "|182320|0.7308917197452229|       628|\n",
      "+------+------------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ctrs.filter(\"ad_id = 182320\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find 99, 95 and 90 percentile values of the view counts of the ads."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3598.51,  305.  ,   81.  ])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "view_counts = ctrs.select(\"view_count\").toPandas()\n",
    "np.percentile(view_counts[\"view_count\"], [99, 95, 90])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To build confidence on the CTR, filter out the ads with fewer than 100 views (approx 99 percentile value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 628 ms, sys: 24 ms, total: 652 ms\n",
      "Wall time: 1.9 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Text(0,0.5,'Frquency (normalized)')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGRVJREFUeJzt3X+0XWV95/H3x4igUwULsTKQGH/EOkgV5YpYqwWKLaUKWhBil1YcbNY4ItKqHS1disxaU7EzWhWVFcEK1iqKv4IiDApotfLjhgYwpmhULKksiKAgIozB7/xxdg63l/tj35u7zzm5eb/Wuou993nO2Z8cIN/7PPvZz05VIUkSwEOGHUCSNDosCpKkPouCJKnPoiBJ6rMoSJL6LAqSpD6LgiSpz6IgSeqzKEiS+h467ABztddee9WKFSuGHUOSdijr1q37cVUtna3dDlcUVqxYwfj4+LBjSNIOJckP27Rz+EiS1GdRkCT1WRQkSX0WBUlSn0VBktRnUZAk9VkUJEl9FgVJUp9FQZLU1/kdzUmWAOPAv1fVCye9titwHnAgcDtwfFXd1HWmUbLizV+c8vhN7/ijASeRpMEsc/F6YCPwqCleOxH4SVU9Kckq4Azg+AFkGnkWC0nD0OnwUZJ9gT8Czp6mydHAuc32BcDvJUmXmSRJ0+u6p/B3wF8Cj5zm9X2AmwGqamuSO4E9gR9PbJRkNbAaYPny5Z2F7dJ0v/lL0ijprKeQ5IXAbVW1bqZmUxyrBx2oWlNVY1U1tnTprCu/SpLmqcvho+cCRyW5CfgEcFiSf5jUZjOwDCDJQ4HdgTs6zCRJmkFnw0dV9RbgLQBJDgHeWFUvn9RsLfBK4JvAscBlVfWgnoIe4AVoSV0a+EN2kpwOjFfVWuAc4KNJNtHrIawadB5J0gMGUhSq6grgimb7rROO3wu8dBAZJEmz845mSVKfRUGS1GdRkCT1WRQkSX0WBUlS38CnpC52LmchaUdmT0GS1GdPYZHwTmdJC8GegiSpz6IgSeqzKEiS+iwKkqQ+i4Ikqc+iIEnqsyhIkvosCpKkvs6KQpLdklyd5LokG5K8fYo2JyTZkmR98/PqrvJIkmbX5R3N9wGHVdXdSXYBvp7kS1V15aR251fVSR3mkCS11FlRqKoC7m52d2l+qqvzSZK2X6fXFJIsSbIeuA24tKqumqLZMUmuT3JBkmVd5pEkzazTolBV91fVAcC+wEFJ9p/U5EJgRVU9DfgycO5Un5NkdZLxJONbtmzpMrIk7dQGMvuoqn4KXAEcMen47VV1X7P7IeDAad6/pqrGqmps6dKlnWaVpJ1Zl7OPlibZo9l+OHA48K+T2uw9YfcoYGNXeSRJs+ty9tHewLlJltArPp+sqi8kOR0Yr6q1wMlJjgK2AncAJ3SYZ6fkcxYkzUWXs4+uB54xxfG3Tth+C/CWrjJIkubGO5olSX0WBUlSn0VBktRnUZAk9VkUJEl9XU5JXdSmm+opSTsyewqSpD6LgiSpz6IgSeprdU0hyaOB/wz8Aripqn7VaSpJ0lBMWxSS7A68FngZ8DBgC7Ab8BtJrgQ+UFWXDySlJGkgZuopXACcBzyvWfq6L8mBwCuSPKGqzukyoCRpcKYtClX1ghleWwes6ySRBsLVUyVNZabho2fO9Maqunbh40iShmmm4aP/0/xzN2AMuA4I8DTgKuB3uo0mSRq0aaekVtWhVXUo8EPgmc3jMA+k94yETYMKKEkanDb3KTylqm7YtlNV3wIO6C6SJGlY2hSFjUnOTnJIkt9N8iFaPEs5yW5Jrk5yXZINSd4+RZtdk5yfZFOSq5KsmPsfQZK0UNoUhVcBG4DXA6cA326OzeY+4LCqejq9nsURSQ6e1OZE4CdV9STg3cAZbYNLkhberHc0V9W9Sc4CLqqqG9t+cFUVcHezu0vzU5OaHQ2c1mxfAJyZJM17JUkDNmtPIclR
wHrg4mb/gCRr23x4kiVJ1gO3AZdW1VWTmuwD3AxQVVuBO4E9p/ic1UnGk4xv2bKlzaklSfPQZvjobcBBwE8Bqmo9sKLNh1fV/VV1ALAvcFCS/Sc1yVRvm+Jz1jSzn8aWLl3a5tSSpHloUxS2VtWd23OSZpmMK4AjJr20GVgGkOShwO7AHdtzLknS/LUpCt9K8ifAkiQrk7wP+OfZ3pRkaZI9mu2HA4cD/zqp2Vrglc32scBlXk+QpOFpUxReBzyV3myif6Q37n9Ki/ftDVye5HrgGnrXFL6Q5PTmOgXAOcCeSTYBfwG8ea5/AEnSwmnzPIUDgbdW1anbDjTrIs249lFVXU/v7ufJx986Yfte4KWt00qSOtWmKFwCXJPkuKq6tTl2NjDjgnnaMU23eiq4gqq0M2gzfHQj8LfAFUl+uzk21awhSdIOrk1PoZprATcC5yf5MFNMG5Uk7fja9BQCUFXfBZ4HPJ/e8tmSpEWmzTIXz5iw/XPguCTLO00lSRqKmZ689pdV9c4k752myckdZZIkDclMPYVty2P7LGZJ2klMWxSq6sLmn+cOLo4kaZhmGj66kBlmGVXVUdO9tpjMNG9fkhabmYaP/vfAUkiSRsJMw0dfHWQQSdLwzTolNclK4G+A/YDdth2vqid0mEuSNARtbl77e+CDwFbgUOA84KNdhpIkDUebovDwqvoKkKr6YVWdBhzWbSxJ0jC0Wfvo3iQPAb6b5CTg34HHdBtLkjQMbXoKpwCPoHcH84HAK3jgaWmSpEWkzdpH1zSbdwOvavvBSZbRu/7wWOBXwJqqes+kNocAnwd+0Bz6TFWd3vYckqSF1Wb20RhwKvC4ie2raraVUrcCb6iqa5M8EliX5NKq+vakdv9UVS+cY25JUgfaXFP4GPAm4AZ6v/G3UlW3ALc02z9LshHYB5hcFCRJI6JNUdhSVWu35yRJVtB7XvNVU7z8nCTXAT8C3lhVG7bnXOrOdEt++JhOafFoUxTeluRs4CvAfdsOVtVn2pwgya8BnwZOqaq7Jr18LfC4qro7yZHA54CVU3zGamA1wPLlPspBkrrSpii8CngKsAsPDB8VMGtRSLILvYLwsamKyMQiUVUXJflAkr2q6seT2q0B1gCMjY35KFBJ6kibovD0qvqtuX5wkgDnABur6l3TtHkscGtVVZKD6E2RvX2u55IkLYw2ReHKJPtNMWtoNs+ld0/DDUnWN8f+ClgOUFVnAccCr0myFfgFsKqq7AlI0pC0KQq/A7wyyQ/oXVMIULNNSa2qrzdtZ2pzJnBmy6ySpI61KQpHdJ5CkjQSZiwKzZpHX6yq/QeURzsgp6pKi8eMax9V1a+A65I4D1SSdgJtho/2BjYkuRr4+baDO8szmiVpZ9KmKLy98xSSpJHQZpXUryb5DeBZzaGrq+q2bmNJkoZh1ucpJDkOuBp4KXAccFWSY7sOJkkavDbDR6cCz9rWO0iyFPgycEGXwSRJg9fmyWsPmTRcdHvL90mSdjBtegoXJ7kE+HizfzxwUXeRJEnD0uZC85uSHENvLaPQe6zmZztPJkkauDY9Barq0/SWwJYkLWJtZh/9cZLvJrkzyV1JfpZk8sNyJEmLQJuewjuBF1XVxq7DSJKGq80solstCJK0c2jTUxhPcj695yfP+RnNkqQdR5uewqOAe4DfB17U/LxwtjclWZbk8iQbk2xI8vop2iTJe5NsSnJ9kmfO9Q8gSVo4baakvmqen70VeENVXZvkkcC6JJdOeqznHwIrm59nAx9s/ilJGoJpi0KSvwY+UFV3TPP6YcAjquoLU71eVbcAtzTbP0uyEdgHmFgUjgbOa57LfGWSPZLs3bx3oKZ7UIwk7Uxm6incAFyY5F7gWmALsBu93+oPoLf+0f9qc5IkK4BnAFdNemkf4OYJ+5ubYwMvCpKkGYpCVX0e+HySlfTuZt4buAv4B2B1Vf2izQmS/Bq9G99OqarJ9zdkqlNP8RmrgdUAy5f7EDhJ6kqb
awrfBb47nw9Psgu9gvCxaWYrbQaWTdjfF/jRFBnWAGsAxsbGHlQ0NJp8drO04+lstdMkAc4BNlbVu6Zpthb402YW0sHAncO4niBJ6mm19tE8PRd4BXBDkvXNsb8ClgNU1Vn0Vls9EthEb9rrfGc6SZIWwKxFIcmvTzcDaSZV9XWmvmYwsU0Br53rZ0uSutFm+OiqJJ9KcmQzJCRJWqTaDB89GTgc+K/A+5olLz5SVd/pNJkWLS9AS6Nr1p5C9VxaVS8DXg28Erg6yVeTPKfzhJKkgWlzTWFP4OX0LhrfCryO3qyhA4BPAY/vMqAkaXDaDB99E/go8OKq2jzh+HiSs7qJJUkahjZF4TebWUIPUlVnLHAeSdIQtZl99H+T7LFtJ8mjk1zSYSZJ0pC0KQpLq+qn23aq6ifAY7qLJEkaljZF4f4k/VXokjyOKRatkyTt+NpcUzgV+HqSrzb7z6dZsVSStLi0WSX14uYxmQfTW7biz6vqx50nkyQNXNsF8XYF7mja75eEqvpad7EkScPQ5ua1M4DjgQ3Ar5rDBVgUJGmRadNTeDG9exXu6zqMJGm42sw++j6wS9dBJEnD16ancA+wPslXgH5voapO7iyVJGko2hSFtc2P1CmX1JaGr82U1HOTPBxYXlU3tv3gJB8GXgjcVlX7T/H6IcDngR80hz5TVae3/XxJ0sKb9ZpCkhcB64GLm/0DkrTpOXwEOGKWNv9UVQc0PxYESRqyNsNHpwEHAVcAVNX6JLM+Q6GqvpZkxXZkkwCHlaRBajP7aGtV3Tnp2EKtffScJNcl+VKSp07XKMnqJONJxrds2bJAp5YkTdamKHwryZ8AS5KsTPI+4J8X4NzXAo+rqqcD7wM+N13DqlpTVWNVNbZ06dIFOLUkaSptisLrgKfSm476ceAu4JTtPXFV3VVVdzfbFwG7JNlrez9XkjR/bWYf3UNvpdRTF/LESR4L3FpVleQgegXq9oU8hyRpbtqsfXQ5U1xDqKrDZnnfx4FDgL2SbAbeRnNndFWdBRwLvCbJVuAXwKrpHvspSRqMNrOP3jhhezfgGGDrbG+qqpfN8vqZwJktzi9JGpA2w0frJh36xoQH7kiSFpE2w0e/PmH3IcCBwGM7SyRJGpo2w0fr6F1TCL1hox8AJ3YZSpI0HG2Gj2a9e1mStDi0GT7645ler6rPLFwcSdIwtRk+OhH4beCyZv9Qeusg3UlvWMmiIEmLRJuiUMB+VXULQJK9gfdX1as6TSZJGrg2y1ys2FYQGrcCT+4ojyRpiNr0FK5Icgm9dY8KWAVc3mkqSdJQtJl9dFKSlwDPbw6tqarPdhtLmp3PWZAW3oxFIckS4JKqOhywEEjSIjfjNYWquh+4J8nuA8ojSRqiNtcU7gVuSHIp8PNtB6vq5M5SSZKGok1R+GLzI0la5KYtCkmWV9W/VdW5gwwkbS8vQEvzN1NP4XPAMwGSfLqqjhlMJKkbFgtpdjNdaM6E7SfM9YOTfDjJbUm+Nc3rSfLeJJuSXJ/kmXM9hyRpYc1UFGqa7bY+Ahwxw+t/CKxsflYDH5zHOSRJC2im4aOnJ7mLXo/h4c02zX5V1aNm+uCq+lqSFTM0ORo4r3ku85VJ9kiy96QlNSRJAzRtUaiqJR2fex/g5gn7m5tjFgVJGpI2C+J1JVMcm3KYKsnqJONJxrds2dJxLEnaeQ2zKGwGlk3Y3xf40VQNq2pNVY1V1djSpUsHEk6SdkbDLAprgT9tZiEdDNzp9QRJGq42dzTPS5KPA4cAeyXZDLwN2AWgqs4CLgKOBDYB9wA+tEeShiy9yT87jrGxsRofH5/Xe6e7eUmaije1aTFJsq6qxmZrN8zhI0nSiLEoSJL6LAqSpD6LgiSpr7PZR9KObqaJCV6E1mJlT0GS1GdPQZoHn82gxcqegiSpz6IgSeqzKEiS+iwKkqQ+i4Ikqc+iIEnqsyhIkvosCpKkPouCJKnPoiBJ6uu0KCQ5IsmNSTYlefMUr5+Q
ZEuS9c3Pq7vMI0maWZfPaF4CvB94AbAZuCbJ2qr69qSm51fVSV3lkAbJNZG0o+tyQbyDgE1V9X2AJJ8AjgYmFwVp0bNYaEfR5fDRPsDNE/Y3N8cmOybJ9UkuSLJsqg9KsjrJeJLxLVu2dJFVkkS3RSFTHKtJ+xcCK6rqacCXgXOn+qCqWlNVY1U1tnTp0gWOKUnapsuisBmY+Jv/vsCPJjaoqtur6r5m90PAgR3mkSTNosuicA2wMsnjkzwMWAWsndggyd4Tdo8CNnaYR5I0i84uNFfV1iQnAZcAS4APV9WGJKcD41W1Fjg5yVHAVuAO4ISu8kiSZpeqycP8o21sbKzGx8fn9d6ZHsQujRJnJWmhJVlXVWOztfOOZklSX5f3KUiaJ+9r0LDYU5Ak9VkUJEl9Dh9JOxCHldQ1ewqSpD6LgiSpz+EjaRGY6z04DjdpOvYUJEl9FgVJUp/DR9JOyFlMmo49BUlSnz0FSX32IGRRkDSr+awwbCHZMTl8JEnqs6cgqRMORe2YOi0KSY4A3kPvyWtnV9U7Jr2+K3AevWcz3w4cX1U3dZlJ0nBZLEZbZ0UhyRLg/cALgM3ANUnWVtW3JzQ7EfhJVT0pySrgDOD4rjJJGl0L9WREi8v26bKncBCwqaq+D5DkE8DRwMSicDRwWrN9AXBmktSO9oxQSSPDnsj26bIo7APcPGF/M/Ds6dpU1dYkdwJ7Aj/uMJekndAwn9G+IxWkLotCpjg2uQfQpg1JVgOrm927k9w4z0x7MboFx2zzM6rZRjUXmG2+5p0tZyxwkv+oba7HtfmwLovCZmDZhP19gR9N02ZzkocCuwN3TP6gqloDrNneQEnGq2psez+nC2abn1HNNqq5wGzzNarZFjpXl/cpXAOsTPL4JA8DVgFrJ7VZC7yy2T4WuMzrCZI0PJ31FJprBCcBl9CbkvrhqtqQ5HRgvKrWAucAH02yiV4PYVVXeSRJs+v0PoWqugi4aNKxt07Yvhd4aZcZJtnuIagOmW1+RjXbqOYCs83XqGZb0FxxtEaStI1rH0mS+hZlUUhyRJIbk2xK8uYpXt81yfnN61clWTFC2Z6f5NokW5McO6hcLbP9RZJvJ7k+yVeStJriNoBc/y3JDUnWJ/l6kv0GkatNtgntjk1SSQY2e6XF93ZCki3N97Y+yatHJVvT5rjmv7cNSf5xFHIlefeE7+s7SX46iFwtsy1PcnmSf2n+Hz1yXieqqkX1Q++i9veAJwAPA64D9pvU5r8DZzXbq4DzRyjbCuBp9NaEOnbEvrdDgUc0268ZxPfWMtejJmwfBVw8Kt9Z0+6RwNeAK4GxUckGnACcOaj/xuaYbSXwL8Cjm/3HjEKuSe1fR28Czah8Z2uA1zTb+wE3zedci7Gn0F9eo6r+H7BteY2JjgbObbYvAH4vyVQ30g08W1XdVFXXA78aQJ65Zru8qu5pdq+kd+/JKOS6a8Luf2KKGyCHla3xP4F3AvcOKNdcsg1Dm2x/Bry/qn4CUFW3jUiuiV4GfHwAuaBdtgIe1WzvzoPvC2tlMRaFqZbX2Ge6NlW1Fdi2vMYoZBuWuWY7EfhSp4l6WuVK8tok36P3l+/JA8jVKluSZwDLquoLA8q0Tdt/n8c0Qw0XJFk2xetdaJPtycCTk3wjyZXNisujkAuAZuj08cBlA8gF7bKdBrw8yWZ6sz5fN58TLcaisGDLa3RgWOdto3W2JC8HxoC/7TRRc7opjj0oV1W9v6qeCPwP4K87T9UzY7YkDwHeDbxhQHkmavO9XQisqKqnAV/mgd5z19pkeyi9IaRD6P1GfnaSPUYg1zargAuq6v4O80zUJtvLgI9U1b7AkfTuAZvz3/GLsSjMZXkNZlpeY0jZhqVVtiSHA6cCR1XVfaOSa4JPAC/uNNEDZsv2SGB/4IokNwEHA2sHdLF51u+tqm6f8O/wQ/SeazIIbf8f/XxV/bKqfgDcSK9IDDvXNqsY3NARtMt2IvBJgKr6JrAbvXWR
5mYQF0kG+UPvN4zv0+vabbsg89RJbV7Lf7zQ/MlRyTah7UcY7IXmNt/bM+hd7Fo5YrlWTth+Eb075kci26T2VzC4C81tvre9J2y/BLhyhLIdAZzbbO9Fb+hkz2Hnatr9JnATzX1eI/SdfQk4odn+L/SKxpwzDuQPNOgfel2n7zR/gZ3aHDud3m+30KugnwI2AVcDTxihbM+i91vBz+k9jW7DCGX7MnArsL75WTsiud4DbGgyXT7TX8yDzjap7cCKQsvv7W+a7+265nt7yghlC/Aues9fuQFYNQq5mv3TgHcM6ruaw3e2H/CN5t/neuD353Me72iWJPUtxmsKkqR5sihIkvosCpKkPouCJKnPoiBJ6uv0ITvSYpLkscDf0Zs2fB+96bnPpjdNcDm95VLupPcQ9VcDG+nddPUwYBw4sap+OfjkUnv2FKQWmgUTPwtcUVVPrKr9gD8H/qCqDqD3vPE3VdUBVXV487bvNa/9Fr07UI8bRnZpLuwpSO0cCvyyqs7adqCq1rd5Y1Xdn+RqRmfxQ2la9hSkdvYH1s3njUl2ozfMdPGCJpI6YFGQuvPEJOvpLVfyb9V7ToY00iwKUjsbmPsqotuuKTwJODjJUQsfS1pYFgWpncuAXZP82bYDSZ6V5Hdne2NV3QK8GXhLh/mkBWFRkFqo3sqRLwFekOR7STbQWy2z7fMwPgc8IsnzOoooLQhXSZUk9dlTkCT1WRQkSX0WBUlSn0VBktRnUZAk9VkUJEl9FgVJUp9FQZLU9/8BkwWqqyRGmbIAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%time ctrs.filter(\"view_count>100\").select(\"ctr\").toPandas()[\"ctr\"].plot.hist(bins = 50, density = True)\n",
    "plt.xlabel(\"CTR\")\n",
    "plt.ylabel(\"Frquency (normalized)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Consier, the CTR as baseline for click prediction. Using CTR as based, calculate the MAP (mean avg precision)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "#y_pred = clicks_train.join(ctrs.select(\"ad_id\", \"ctr\"), on = [\"ad_id\"], how=\"left\").select(\"ctr\").toPandas()[\"ctr\"]\n",
    "#y_true = clicks_train.select(\"clicked\").toPandas()[\"clicked\"]\n",
    "\n",
    "#%time metrics.average_precision_score(y_true, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+----------+-------------------+\n",
      "|ad_id|display_id|                ctr|\n",
      "+-----+----------+-------------------+\n",
      "|   18|  18637093|                0.0|\n",
      "|   38|  19842301|               null|\n",
      "|   38|  21227007|               null|\n",
      "|   70|  22445644|               null|\n",
      "|   93|  17994591|0.11956521739130435|\n",
      "|   93|  18082288|0.11956521739130435|\n",
      "|   93|  18138846|0.11956521739130435|\n",
      "|   93|  18139717|0.11956521739130435|\n",
      "|   93|  18146038|0.11956521739130435|\n",
      "|   93|  18209662|0.11956521739130435|\n",
      "|   93|  18242010|0.11956521739130435|\n",
      "|   93|  18265668|0.11956521739130435|\n",
      "|   93|  18299525|0.11956521739130435|\n",
      "|   93|  18300420|0.11956521739130435|\n",
      "|   93|  18341642|0.11956521739130435|\n",
      "|   93|  18419095|0.11956521739130435|\n",
      "|   93|  18422601|0.11956521739130435|\n",
      "|   93|  18427146|0.11956521739130435|\n",
      "|   93|  18480210|0.11956521739130435|\n",
      "|   93|  18525463|0.11956521739130435|\n",
      "+-----+----------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "clicks_test_baseline = clicks_test.join(ctrs.select(\"ad_id\", \"ctr\"), on = [\"ad_id\"], how=\"left\")\n",
    "clicks_test_baseline.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[display_id: int, count: bigint]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clicks_test_baseline.groupBy(\"display_id\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|           avg(ap)|\n",
      "+------------------+\n",
      "|0.9835441908732905|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "clicks_test_baseline.groupBy(\"display_id\").agg(F.sum(\"ctr\").alias(\"ap\")).selectExpr(\"avg(ap)\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many ads are there for each display_id?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+\n",
      "|count|\n",
      "+-----+\n",
      "|    9|\n",
      "|    3|\n",
      "|    6|\n",
      "|    7|\n",
      "|    5|\n",
      "|    8|\n",
      "|   11|\n",
      "|    4|\n",
      "|    2|\n",
      "|   12|\n",
      "|   10|\n",
      "+-----+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 4.03 s\n"
     ]
    }
   ],
   "source": [
    "%time clicks_train.groupBy(\"display_id\").count().select(\"count\").distinct().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Does each display_id in the training dataset has atleast one click?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 4.31 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time clicks_train.groupBy(\"display_id\").agg(F.sum(\"clicked\").alias(\"clicks\")).filter(\"clicks=0\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, each display_id has atleast one click. Does it display_id in the training dataset has more than click?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
      "Wall time: 3.85 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time clicks_train.groupBy(\"display_id\").agg(F.sum(\"clicked\").alias(\"clicks\")).filter(\"clicks>1\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, each display_id in the clicks dataset has only one click."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Page Views"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading page_view sample dataset\n",
      "Number of partitions for df page_views_sample: 8\n",
      "root\n",
      " |-- uuid: string (nullable = true)\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- platform: integer (nullable = true)\n",
      " |-- geo_location: string (nullable = true)\n",
      " |-- traffic_source: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "if fast_mode:\n",
    "    print(\"Loading page_view sample dataset\")\n",
    "    page_views = load(\"page_views_sample\", rebase_timestamp=True, cache=True)\n",
    "    #page_views = page_views.sample(False, 0.01, 1)\n",
    "    #cache_df(page_views, \"page_views\")\n",
    "else:\n",
    "    print(\"Loading full page_view dataset\")\n",
    "    page_views = load(\"page_views\", rebase_timestamp=True, cache=False)\n",
    "page_views.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+-----------+-------------------+--------+------------+--------------+\n",
      "|          uuid|document_id|          timestamp|platform|geo_location|traffic_source|\n",
      "+--------------+-----------+-------------------+--------+------------+--------------+\n",
      "|289a8ac9e3b3be|     647313|2016-06-14 23:30:43|       1|       PK>08|             1|\n",
      "|2cd20abba0d948|     647313|2016-06-14 18:19:16|       3|   US>CA>804|             1|\n",
      "|6149469c04c102|     647313|2016-06-14 10:12:32|       1|       AU>01|             1|\n",
      "|f3bca9d885121d|     647313|2016-06-15 06:07:45|       2|   US>KS>616|             1|\n",
      "|594de491d7a3d1|     647313|2016-06-14 13:09:20|       1|   US>CA>807|             1|\n",
      "|b5fe86af022f0c|     647313|2016-06-14 12:13:02|       3|       GB>H9|             1|\n",
      "|56bb88f5111d21|     647313|2016-06-14 11:02:45|       1|       SI>61|             1|\n",
      "|82a3dae458fd43|     647313|2016-06-14 15:46:58|       1|       MU>17|             1|\n",
      "|4323bb94758e52|     647313|2016-06-14 10:04:50|       1|       PL>78|             1|\n",
      "|111a86a1682ab8|     647313|2016-06-14 14:13:52|       1|       FR>A8|             1|\n",
      "|4cf1b402a2fbab|     647313|2016-06-14 13:03:47|       1|       ZA>11|             1|\n",
      "|9ee101729fa2b5|     647313|2016-06-14 11:10:40|       1|       AE>03|             1|\n",
      "|d01385dc210da5|     647313|2016-06-14 17:29:34|       1|       GB>H9|             1|\n",
      "|875f65b5884501|     647313|2016-06-15 06:57:43|       2|          KR|             1|\n",
      "|f2654a6840add7|     647313|2016-06-14 22:14:46|       1|          GB|             1|\n",
      "|326dc1e6a53219|     647776|2016-06-15 05:55:39|       2|       US>FL|             1|\n",
      "|43ff58e09ff220|     647776|2016-06-14 23:22:53|       1|   US>VA>511|             1|\n",
      "|db647197db75cb|     647776|2016-06-15 07:33:25|       2|   US>FL>571|             1|\n",
      "|a67495e38150ba|     647776|2016-06-15 02:27:59|       1|   US>PA>508|             1|\n",
      "|527f6b4676bbc7|     647776|2016-06-15 06:38:04|       3|   US>FL>534|             1|\n",
      "+--------------+-----------+-------------------+--------+------------+--------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "page_views.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The page views table is nearly 100 GB as a decompressed CSV file. How many records are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9999999"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page_views.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Does each record in page_views have a timestamp?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 47.7 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time page_views.filter(\"isnull(timestamp)\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---------+---------+\n",
      "|  users|documents|locations|\n",
      "+-------+---------+---------+\n",
      "|9202149|    59849|     2964|\n",
      "+-------+---------+---------+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 3.37 s\n"
     ]
    }
   ],
   "source": [
    "stats = page_views.selectExpr(\"count(distinct(uuid)) as users\"\n",
    "                      , \"count(distinct(document_id)) as documents\"\n",
    "                      , \"count(distinct(geo_location)) as locations\")\n",
    "%time stats.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some users are more frequent visitors than others. Find the number of users for each view count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>view_count</th>\n",
       "      <th>num_users</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1</td>\n",
       "      <td>8546933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2</td>\n",
       "      <td>549587</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>80770</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>4</td>\n",
       "      <td>16394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>6165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>13 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    view_count  num_users\n",
       "11           1    8546933\n",
       "9            2     549587\n",
       "1            3      80770\n",
       "8            4      16394\n",
       "4            5       6165\n",
       "..         ...        ...\n",
       "0            9         90\n",
       "12          10         22\n",
       "6           11          6\n",
       "10          12          2\n",
       "7           14          1\n",
       "\n",
       "[13 rows x 2 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page_views_by_user = page_views.groupBy(\"uuid\").count().groupBy(\"count\").count()\\\n",
    "                    .toDF(\"view_count\", \"num_users\").toPandas().sort_values(\"view_count\")\n",
    "page_views_by_user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f19c90187b8>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAENCAYAAADKcIhSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAH4dJREFUeJzt3X+UVOWd5/H3xwYFI8oPwR+0EZyQjGgUtUWczGRUEkCMwsSYIeYMPS4JOQ4Zzcy4EzLmSCbRXd2dEzNujFk2ouCPoGKMbKJBgppMMgo00oJIXDpIoIORllaCv2LQ7/5xn9ayraar6lZT0Hxe59Spe7/3eZ7vrYLub98fVY8iAjMzszwOqPUOmJnZvs/FxMzMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7PcXEzMzCw3FxMzM8utT613YE85/PDDY8SIEbXeDTOzfcqqVateiIih3bXbb4rJiBEjaGpqqvVumJntUyT9ppR2Ps1lZma5uZiYmVluLiZmZpbbfnPNxMz2TX/84x9pbW3l9ddfr/Wu9Gr9+vWjvr6evn37VtTfxcTM9mqtra0MGDCAESNGIKnWu9MrRQTbt2+ntbWVkSNHVjSGT3OZ2V7t9ddfZ8iQIS4kPUgSQ4YMyXX052JiZns9F5Kel/c9djExM7PcfM0EGDH7x2W133TteT20J2bWnXJ/Xrvjn+fq8JGJmVkv9+abb/Z4DhcTM7NubNq0ieOPP57Pf/7znHDCCUyYMIHXXnuNs8466+2vaXrhhRfo+P6/W2+9lalTp3L++eczcuRIvv3tb/PNb36TU045hXHjxtHe3t5lrq7GXLduHWPHjmXMmDGcdNJJbNiwAYDbb7/97fgXvvCFtwvHIYccwlVXXcUZZ5zBY489xuzZsxk9ejQnnXQSV1xxRdXfIxcTM7MSbNiwgVmzZrFu3ToGDhzIvffeu9v2Tz31FHfeeScrVqzgyiuv5OCDD2b16tWceeaZLFiwoOz83/3ud7n88stpbm6mqamJ+vp61q9fz1133cUvf/lLmpubqaur44477gDglVde4cQTT2T58uWMHj2a++67j3Xr1rFmzRq++tWvVvQe7I6vmZiZlWDkyJGMGTMGgNNOO41Nmzbttv3ZZ5/NgAEDGDBgAIcddhjnn38+AB/+8IdZs2ZN2fnPPPNMrrnmGlpbW/nkJz/JqFGjWLZsGatWreL0008H4LXXXmPYsGEA1NXVceGFFwJw6KGH0q9fPz73uc9x3nnn8YlPfKLs/N3xkYmZWQkOOuigt5fr6urYtWsXffr04a233gJ4z2c0CtsfcMABb68fcMAB7Nq1q8s8XY158cUXs3jxYvr378/EiRN5+OGHiQgaGxtpbm6mubmZZ555hq997WtA9on2urq6t8dcsWIFF154IT/84Q+ZNGlSjneiOBcTM7MKjRgxglWrVgGwaNGiHh1z48aNHHfccVx22WVccMEFrFmzhvHjx7No0SK2bdsGQHt7O7/5zXu/Mf7ll19mx44dTJ48mW9961s0NzdXZV8L+TSXme1T9qZbea+44go+/elPc9ttt3HOOef06Jh33XUXt99+O3379uXII4/kqquuYvDgwVx99dVMmDCBt956i759+3LjjTdy7LHHvmvMnTt3MmXKFF5//XUiguuvv74q+1pIEVH1QfdGDQ0N0dXkWP6cidnea/369Rx//PG13o39QrH3WtKqiGjorq9Pc5mZWW4lFRNJ/yBpnaSnJH1fUj9JIyUtl7RB0l2SDkxtD0rrLWn7iIJxvpLiz0iaWBCflGItkmYXxMvOYWa2L5g1axZjxox51+OWW26p9W5VrNtrJpKGA5cBoyPiNUl3A9OAycD1EbFQ0neBGcBN6fnFiPiApGnAdcBfSxqd+p0AHA38VNIHU5obgY8DrcBKSYsj4unUt+QcVXlHzMz2gBtvvLHWu1BVpZ7m
6gP0l9QHOBh4DjgH6LjVYD4wNS1PSeuk7eOVfR3lFGBhRPwhIp4FWoCx6dESERsj4g1gITAl9Sk3h5n1QvvLtd1ayvsed1tMIuK3wL8Bm8mKyA5gFfBSRHTcLN0KDE/Lw4Etqe+u1H5IYbxTn67iQyrI8S6SZkpqktTU1tbW3Us1s71Qv3792L59uwtKD+qYHKtfv34Vj1HKaa5BZEcCI4GXgHuAc4vtT0eXLrZ1FS9W0HbXfnc53h2ImAvMhexuriJ9zGwvV19fT2trK/6DsGd1TNtbqVI+Z/Ix4NmIaAOQ9APgz4CBkvqkI4N6YGtq3wocA7Sm02KHAe0F8Q6FfYrFX6ggh5n1Mn379q14Klnbc0q5ZrIZGCfp4HRdYjzwNPAI8KnUphG4Py0vTuuk7Q9Hdny6GJiW7sQaCYwCVgArgVHpzq0DyS7SL059ys1hZmY10O2RSUQsl7QIeALYBawmO3X0Y2ChpKtT7ObU5WbgNkktZEcL09I469KdYE+ncWZFxJsAkr4ILAHqgHkRsS6N9eVycpiZWW34E/D4E/BmZl3xJ+DNzGyPcTExM7PcXEzMzCw3FxMzM8vNxcTMzHJzMTEzs9xcTMzMLDcXEzMzy83FxMzMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7PcXEzMzCw3FxMzM8ut22Ii6UOSmgsev5f0JUmDJS2VtCE9D0rtJekGSS2S1kg6tWCsxtR+g6TGgvhpktamPjek6YGpJIeZme153RaTiHgmIsZExBjgNOBV4D5gNrAsIkYBy9I6wLlk87uPAmYCN0FWGIA5wBnAWGBOR3FIbWYW9JuU4mXlMDOz2ij3NNd44NcR8RtgCjA/xecDU9PyFGBBZB4HBko6CpgILI2I9oh4EVgKTErbDo2IxyKbQ3hBp7HKyWFmZjVQbjGZBnw/LR8REc8BpOdhKT4c2FLQpzXFdhdvLRKvJMe7SJopqUlSU1tbWxkv08zMylFyMZF0IHABcE93TYvEooJ4JTneHYiYGxENEdEwdOjQboY0M7NKlXNkci7wREQ8n9af7zi1lJ63pXgrcExBv3pgazfx+iLxSnKYmVkNlFNMPsM7p7gAFgMdd2Q1AvcXxKenO67GATvSKaolwARJg9KF9wnAkrRtp6Rx6S6u6Z3GKieHmZnVQJ9SGkk6GPg48IWC8LXA3ZJmAJuBi1L8AWAy0EJ259clABHRLukbwMrU7usR0Z6WLwVuBfoDD6ZH2TnMzKw2SiomEfEqMKRTbDvZ3V2d2wYwq4tx5gHzisSbgBOLxMvOYWZme54/AW9mZrm5mJiZWW4uJmZmlpuLiZmZ5eZiYmZmubmYmJlZbi4mZmaWm4uJmZnl5mJiZma5uZiYmVluLiZmZpabi4mZmeXmYmJmZrm5mJiZWW4uJmZmlltJxUTSQEmLJP1K0npJZ0oaLGmppA3peVBqK0k3SGqRtEbSqQXjNKb2GyQ1FsRPk7Q29bkhzbhIJTnMzGzPK/XI5N+Bn0TEnwInA+uB2cCyiBgFLEvrkM0VPyo9ZgI3QVYYgDnAGcBYYE5HcUhtZhb0m5TiZeUwM7Pa6LaYSDoU+ChwM0BEvBERLwFTgPmp2XxgalqeAiyIzOPAQElHAROBpRHRHhEvAkuBSWnboRHxWJpBcUGnscrJYWZmNVDKkclxQBtwi6TVkr4n6X3AERHxHEB6HpbaDwe2FPRvTbHdxVuLxKkgx7tImimpSVJTW1tbCS/VzMwqUUox6QOcCtwUEacAr/DO6aZiVCQWFcR3p6Q+ETE3IhoiomHo0KHdDGlmZpUqpZi0Aq0RsTytLyIrLs93nFpKz9sK2h9T0L8e2NpNvL5InApymJlZDXRbTCLid8AWSR9KofHA08BioOOOrEbg/rS8GJie7rgaB+xIp6iWABMkDUoX3icAS9K2nZLGpbu4pncaq5wcZmZWA31KbPf3wB2SDgQ2ApeQFaK7Jc0ANgMXpbYPAJOBFuDV
1JaIaJf0DWBlavf1iGhPy5cCtwL9gQfTA+DacnKYmVltlFRMIqIZaCiyaXyRtgHM6mKcecC8IvEm4MQi8e3l5jAzsz3Pn4A3M7PcXEzMzCw3FxMzM8vNxcTMzHJzMTEzs9xcTMzMLDcXEzMzy83FxMzMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7PcXEzMzCw3FxMzM8utpGIiaZOktZKaJTWl2GBJSyVtSM+DUlySbpDUImmNpFMLxmlM7TdIaiyIn5bGb0l9VWkOMzPb88o5Mjk7IsZERMckWbOBZRExCliW1gHOBUalx0zgJsgKAzAHOAMYC8zpKA6pzcyCfpMqyWFmZrWR5zTXFGB+Wp4PTC2IL4jM48BASUcBE4GlEdEeES8CS4FJaduhEfFYmkFxQaexyslhZmY1UGoxCeAhSaskzUyxIyLiOYD0PCzFhwNbCvq2ptju4q1F4pXkeBdJMyU1SWpqa2sr8aWamVm5SpoDHvhIRGyVNAxYKulXu2mrIrGoIL47JfWJiLnAXICGhobuxjQzswqVdGQSEVvT8zbgPrJrHs93nFpKz9tS81bgmILu9cDWbuL1ReJUkMPMzGqg22Ii6X2SBnQsAxOAp4DFQMcdWY3A/Wl5MTA93XE1DtiRTlEtASZIGpQuvE8AlqRtOyWNS3dxTe80Vjk5zMysBko5zXUEcF+6W7cPcGdE/ETSSuBuSTOAzcBFqf0DwGSgBXgVuAQgItolfQNYmdp9PSLa0/KlwK1Af+DB9AC4tpwcZmZWG90Wk4jYCJxcJL4dGF8kHsCsLsaaB8wrEm8CTqxGDjMz2/P8CXgzM8vNxcTMzHJzMTEzs9xcTMzMLDcXEzMzy83FxMzMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7PcXEzMzCw3FxMzM8vNxcTMzHJzMTEzs9xKLiaS6iStlvSjtD5S0nJJGyTdJenAFD8orbek7SMKxvhKij8jaWJBfFKKtUiaXRAvO4eZme155RyZXA6sL1i/Drg+IkYBLwIzUnwG8GJEfAC4PrVD0mhgGnACMAn4TipQdcCNwLnAaOAzqW3ZOczMrDZKKiaS6oHzgO+ldQHnAItSk/nA1LQ8Ja2Tto9P7acACyPiDxHxLNmUu2PToyUiNkbEG8BCYEqFOczMrAZKPTL5FvDPwFtpfQjwUkTsSuutwPC0PBzYApC270jt34536tNVvJIc7yJppqQmSU1tbW0lvlQzMytXt8VE0ieAbRGxqjBcpGl0s61a8e7yvxOImBsRDRHRMHTo0CJdzMysGvqU0OYjwAWSJgP9gEPJjlQGSuqTjgzqga2pfStwDNAqqQ9wGNBeEO9Q2KdY/IUKcpiZWQ10e2QSEV+JiPqIGEF2Af3hiPgs8AjwqdSsEbg/LS9O66TtD0dEpPi0dCfWSGAUsAJYCYxKd24dmHIsTn3KzWFmZjVQypFJV74MLJR0NbAauDnFbwZuk9RCdrQwDSAi1km6G3ga2AXMiog3ASR9EVgC1AHzImJdJTnMzKw2tL/8Qd/Q0BBNTU1Ft42Y/eOyxtp07XnV2CUzs72epFUR0dBdO38C3szMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7PcXEzMzCw3FxMzM8vNxcTMzHJzMTEzs9xcTMzMLDcXEzMzy83FxMzMcnMxMTOz3FxMzMwst1LmgO8naYWkJyWtk/SvKT5S0nJJGyTdlWZJJM2keJeklrR9RMFYX0nxZyRNLIhPSrEWSbML4mXnMDOzPa+UI5M/AOdExMnAGGCSpHHAdcD1ETEKeBGYkdrPAF6MiA8A16d2SBpNNiPiCcAk4DuS6iTVATcC5wKjgc+ktpSbw8zMaqOUOeAjIl5Oq33TI4BzgEUpPh+YmpanpHXS9vGSlOILI+IPEfEs0AKMTY+WiNgYEW8AC4EpqU+5OczMrAZKumaSjiCagW3AUuDXwEsRsSs1aQWGp+XhwBaA
tH0HMKQw3qlPV/EhFeTovN8zJTVJamprayvlpZqZWQVKKiYR8WZEjAHqyY4kji/WLD0XO0KIKsZ3l+PdgYi5EdEQEQ1Dhw4t0sXMzKqhrLu5IuIl4FFgHDBQUp+0qR7YmpZbgWMA0vbDgPbCeKc+XcVfqCCHmZnVQCl3cw2VNDAt9wc+BqwHHgE+lZo1Aven5cVpnbT94YiIFJ+W7sQaCYwCVgArgVHpzq0DyS7SL059ys1hZmY10Kf7JhwFzE93XR0A3B0RP5L0NLBQ0tXAauDm1P5m4DZJLWRHC9MAImKdpLuBp4FdwKyIeBNA0heBJUAdMC8i1qWxvlxODjMzq41ui0lErAFOKRLfSHb9pHP8deCiLsa6BrimSPwB4IFq5DAzsz3Pn4A3M7PcXEzMzCw3FxMzM8vNxcTMzHJzMTEzs9xcTMzMLDcXEzMzy83FxMzMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7PcXEzMzCw3FxMzM8vNxcTMzHIrZdreYyQ9Imm9pHWSLk/xwZKWStqQngeluCTdIKlF0hpJpxaM1Zjab5DUWBA/TdLa1OcGSao0h5mZ7XmlHJnsAv4pIo4HxgGzJI0GZgPLImIUsCytA5xLNr/7KGAmcBNkhQGYA5xBNnvinI7ikNrMLOg3KcXLymFmZrXRbTGJiOci4om0vBNYDwwHpgDzU7P5wNS0PAVYEJnHgYGSjgImAksjoj0iXgSWApPStkMj4rGICGBBp7HKyWFmZjVQ1jUTSSPI5oNfDhwREc9BVnCAYanZcGBLQbfWFNtdvLVInApydN7fmZKaJDW1tbWV81LNzKwMJRcTSYcA9wJfiojf765pkVhUEN/t7pTSJyLmRkRDRDQMHTq0myHNzKxSJRUTSX3JCskdEfGDFH6+49RSet6W4q3AMQXd64Gt3cTri8QryWFmZjVQyt1cAm4G1kfENws2LQY67shqBO4viE9Pd1yNA3akU1RLgAmSBqUL7xOAJWnbTknjUq7pncYqJ4eZmdVAnxLafAT4G2CtpOYU+xfgWuBuSTOAzcBFadsDwGSgBXgVuAQgItolfQNYmdp9PSLa0/KlwK1Af+DB9KDcHGZmVhvdFpOI+AXFr1EAjC/SPoBZXYw1D5hXJN4EnFgkvr3cHGZmtuf5E/BmZpabi4mZmeXmYmJmZrm5mJiZWW4uJmZmlpuLiZmZ5eZiYmZmubmYmJlZbi4mZmaWm4uJmZnl5mJiZma5uZiYmVluLiZmZpabi4mZmeXmYmJmZrmVMtPiPEnbJD1VEBssaamkDel5UIpL0g2SWiStkXRqQZ/G1H6DpMaC+GmS1qY+N6TZFivKYWZmtVHKkcmtwKROsdnAsogYBSxL6wDnAqPSYyZwE2SFAZgDnAGMBeZ0FIfUZmZBv0mV5DAzs9rptphExM+B9k7hKcD8tDwfmFoQXxCZx4GBko4CJgJLI6I9Il4ElgKT0rZDI+KxNHvigk5jlZPDzMxqpNJrJkdExHMA6XlYig8HthS0a02x3cVbi8QryfEekmZKapLU1NbWVtYLNDOz0lX7AnyxueKjgnglOd4bjJgbEQ0R0TB06NBuhjUzs0pVWkye7zi1lJ63pXgrcExBu3pgazfx+iLxSnKYmVmNVFpMFgMdd2Q1AvcXxKenO67GATvSKaolwARJg9KF9wnAkrRtp6Rx6S6u6Z3GKieHmZnVSJ/uGkj6PnAWcLikVrK7sq4F7pY0A9gMXJSaPwBMBlqAV4FLACKiXdI3gJWp3dcjouOi/qVkd4z1Bx5MD8rNYWZmtdNtMYmIz3SxaXyRtgHM6mKcecC8IvEm4MQi8e3l5jAzs9rwJ+DNzCw3FxMzM8vNxcTMzHJzMTEzs9xcTMzMLLdu7+ayKvnaYRX02VH9/TAz6wE+MjEzs9xcTMzMLDcXEzMzy83FxMzMcnMxMTOz3FxMzMwsNxcTMzPLzcXEzMxyczExM7Pc9tliImmSpGcktUia
Xev9MTPbn+2TX6ciqQ64Efg42ZzwKyUtjoina7tntffh+R8uu8/axrU9sCdmtj/ZV49MxgItEbExIt4AFgJTarxPZmb7rX3yyAQYDmwpWG8FzujcSNJMYGZafVnSM2XmORx44T3jXlfmKBXm4V/V8zkA/e2eyVNlvSlPb3otvS1Pb3otleY5tpRG+2oxKfbbL94TiJgLzK04idQUEQ2V9t+b8vSm19Lb8vSm19Lb8vSm19LTefbV01ytwDEF6/XA1hrti5nZfm9fLSYrgVGSRko6EJgGLK7xPpmZ7bf2ydNcEbFL0heBJUAdMC8i1vVAqopPke2FeXrTa+lteXrTa+lteXrTa+nRPIp4z6UGMzOzsuyrp7nMzGwv4mJiZma5uZiYmVluLia9lKSxkk5Py6Ml/aOkyXsg74KezmHdk3SgpOmSPpbWL5b0bUmzJPWt9f5Z7+ML8HuYpD8l+wT/8oh4uSA+KSJ+UqUcc4Bzye7WW0r27QCPAh8DlkTENVXK0/l2bAFnAw8DRMQF1chTJO+fk32lzlMR8VCVxjwDWB8Rv5fUH5gNnAo8Dfy3iNhRpTyXAfdFxJZuG+fLcwfZv//BwEvAIcAPgPFkP/eNVcz1J8BfkX32axewAfh+td4z2ze4mJRA0iURcUsVxrkMmAWsB8YAl0fE/WnbExFxat4caay1afyDgN8B9QW/JJdHxElVyvME2S/b75F9A4GA75N97oeI+FmV8qyIiLFp+fNk7+F9wATg/0bEtVXIsQ44Od12Phd4FVhE9sv35Ij4ZN4cKc8O4BXg12Tv1T0R0VaNsTvlWRMRJ0nqA/wWODoi3pQk4Mkq/h+4DDgf+BkwGWgGXiQrLn8XEY9WI49Vh6RhEbGtRwaPCD+6eQCbqzTOWuCQtDwCaCIrKACrq7i/q4stp/XmKuY5APgHsqOfMSm2sQfe/8LXsxIYmpbfB6ytUo71BctP9OB7tjq9bxOAm4E24CdAIzCginmeAg4EBgE7gcEp3q/wtVYhz1qgLi0fDDyalt9f5f/ThwHXAr8CtqfH+hQbWO3/c13sw4NVGudQ4L8DtwEXd9r2nSru7+BOjyHApvR/YnC135998kOLPUHSmq42AUdUKU1dpFNbEbFJ0lnAIknHUvz7xir1hqSDI+JV4LSOoKTDgLeqlSQi3gKul3RPen6envkg7AGSBpH9Elakv+Qj4hVJu6qU46mCI9AnJTVERJOkDwJ/rFIOgEjv20PAQ+n6xbnAZ4B/A4ZWKc/NZL9464ArgXskbQTGkX3LdjX1Ad4kOxIeABARm6t8beZustOnZ0XE7wAkHUlWhO8hm44iN0ldnR0Q2dF+NdxCdirwXuC/SLqQrKj8gezfp1peAH7TKTYceILsTMJxVczl01wd0i/CiWSH6O/aBPxnRBxdhRwPA/8YEc0FsT7APOCzEVGXN0ca86D0H7Nz/HDgqIjokQlMJJ0HfCQi/qXK424iK4Ii+yH4s4j4naRDgF9ERO4f8lRo/x34C7IfwlPJvpl6C3BZRDyZN0fKszoiTuliW/+IeK0aedJ4RwNExFZJA8mumW2OiBVVzHE5MAN4HPgocF1E3CJpKHBvRHy0SnmeiYgPlbutgjxvkp2yK/bH3biI6F+FHM2F/2clXUl2ivACYGlU73T3FWT/5v+142de0rMRMbIa478nn4tJRtLNwC0R8Ysi2+6MiIurkKMe2NXxl1WnbR+JiF/mzbE/kXQwcEREPFvFMQeQ/cXWB2iNiOerNXYa/4MR8f+qOWatSToBOJ7shohf9VCOh4CfAvM7/k0kHQH8LfDxiPhYlfI8BfxVRGwosm1LRBxTpFu5OdYDJ6Qj1I5YI/DPZKfBS/rK9xJz1QPXk/1RNIfsellVj0jezuViYmZ7u3SaczbZJHjDUvh5si94vTYiOp9RqDTPp8iuw71n7iNJUyPih1XI8T+AhyLip53ik4D/FRGj8uYokvN8stOdIyLiyGqPDy4mZraPq9bd
lntDnp7Mke7m/JOIeKon8riYmNk+TdLmiHh/b8izL78W381lZnu9PXS35R7J05teSyEXEzPbFxzBbu623Mfy9KbX8jYXEzPbF/yI7E6n5s4bJD26j+XpTa/lnTF9zcTMzPLytwabmVluLiZmZpabi4mZmeXmYmJWIklHS1pU6/2ohKQvpa+fMesRvgBvth9IX5bZEBEv1HpfrHfykYlZEZKuk/R3Betfk/RP6YsAkVQn6X9KWilpjaQvpPh3JF2Qlu+TNC8tz5B09W7yTU/jPCnpthQ7VtKyFF8m6f0pfmv6DqmOvi+n57MkPSppkaRfSbpDmcuAo4FHJD1S7ffKDFxMzLqyEPjrgvVPk03M1WEGsCMiTgdOBz4vaSTwc7KvsYds7ojRafnPgf8olih96+6VwDkRcTJwedr0bWBBZLMi3gHcUMJ+nwJ8KeU9jmxKgBuArcDZEXF2CWOYlc3FxKyIiFgNDEvXSU4m+xTx5oImE4DpkpqB5WSz2I0iKxh/IWk02ZTGz0s6CjiTrj91fA6wqOMUVES0p/iZwJ1p+TaygtSdFRHRmr7evJlsRk+zHudPwJt1bRHwKeBI3js7oYC/j4glnTulr0ufRHaUMpjsqObliNjZRZ6OSb+609FmF+kPQUkim563Q+GkaG/in3HbQ3xkYta1hcA0soLS+S6uJcClHVPTSvqgpPelbY+RnWr6OdmRyhV0cYorWQZ8WtKQNNbgFP/PlB/gs0DHxG2beGc65ilAKdPj7iRNqWvWE1xMzLoQEevIfgH/NiKe67T5e2SnsZ5IF+X/N+8cBfwH0CciWsjm2x7MbopJynMN8DNJTwLfTJsuAy5J3/76N7xzLeX/AH8paQVwBvBKCS9nLvCgL8BbT/GtwWZmlpuPTMzMLDdfnDPbQ9I1kWVFNo2PiO17en/MqsmnuczMLDef5jIzs9xcTMzMLDcXEzMzy83FxMzMcvv/tSMJiTE29GMAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "page_views_by_user.iloc[:20, :].plot.bar(\"view_count\", \"num_users\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of unique users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "users_distinct_count = page_views_by_user.num_users.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Average page views per user "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0867025734966909"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(page_views_by_user.num_users * page_views_by_user.view_count).sum()/users_distinct_count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cumulative share of users by view count (stored as a fraction in `cum_percentual`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>view_count</th>\n",
       "      <th>num_users</th>\n",
       "      <th>cum_percentual</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1</td>\n",
       "      <td>8546933</td>\n",
       "      <td>0.928798</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2</td>\n",
       "      <td>549587</td>\n",
       "      <td>0.988521</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>80770</td>\n",
       "      <td>0.997299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>4</td>\n",
       "      <td>16394</td>\n",
       "      <td>0.999080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>6165</td>\n",
       "      <td>0.999750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9</td>\n",
       "      <td>90</td>\n",
       "      <td>0.999997</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10</td>\n",
       "      <td>22</td>\n",
       "      <td>0.999999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>13 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    view_count  num_users  cum_percentual\n",
       "11           1    8546933        0.928798\n",
       "9            2     549587        0.988521\n",
       "1            3      80770        0.997299\n",
       "8            4      16394        0.999080\n",
       "4            5       6165        0.999750\n",
       "..         ...        ...             ...\n",
       "0            9         90        0.999997\n",
       "12          10         22        0.999999\n",
       "6           11          6        1.000000\n",
       "10          12          2        1.000000\n",
       "7           14          1        1.000000\n",
       "\n",
       "[13 rows x 3 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page_views_by_user[\"cum_percentual\"] = page_views_by_user.num_users.cumsum()/users_distinct_count\n",
    "page_views_by_user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f19c94b3940>"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAELCAYAAAA1AlaNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xt81PWd7/HXJ3duAklQkSABS1lRENyIttTiZb2tVuqlVmu1dKt2q6yXXbsHH5xjLV1WWzk9XmprqUuRHqtrOV1rLV11KdhuayuxAnIRRZxAwGoykQAZQi7zOX/ML2EYEjIhCUPm934+Hnnkd/n+Zj4T5Z1vvvOd78/cHRERCYecTBcgIiJHjkJfRCREFPoiIiGi0BcRCRGFvohIiCj0RURCRKEvIhIiCn0RkRBR6IuIhEhepgtIVVpa6uXl5ZkuQ0SkX3n99ddr3X1EV+2OutAvLy+nsrIy02WIiPQrZlaVTjsN74iIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIh0GfpmtsjMPjSzdZ2cNzN7xMw2m9laMzs96dyXzOyd4OtLvVm4iIh0Xzo9/cXAxYc4fwkwPvi6BfgBgJkVA98AzgSmAd8ws+E9KVZERHqmy3n67v5bMys/RJOZwBJP3Hfxj2Y2zMxGAucAL7t7HYCZvUzil8fTPS1askc87rTEnZZ4nOZWp6U1TmvcaY4ntptbHXcn7uA48Xjiuzu4Q9yduDsOuHtwjP3XBOfi7eeCa9sfa3+b/XcObWtDe1vatxPt21v6gfUkrt7/mPsbHvi6k88m37E09ealB57r+Jp0HFU3RdUtWjtVOriQSyaN7NPn6I0PZ40CtiXtVwfHOjt+EDO7hcRfCZx44om9UJL0ln0treyMNRPd08RHsSbqGvZ/fRRrItrQxK69zTS3hXVrIsBbWv3Qx4JQj+vfv0i7KaOH9YvQtw6O+SGOH3zQfSGwEKCiokIx0EficWdXY/P+wG4P8mbqGvbt/x5r5qMg2Pfsa+n08YYNzKd4YAHHDMinIDeHvJwcivKN/Nwc8nIS33NzjLxcIz8nh7xcIy/HyMvN6fhYcE37seB8jrV9gRmYGQaJYzlgWPvxHEvs57S1s0S7xPcDrzXbfy3Qvt++nXo+OE7b89F2nbWfS74++Xtbu2TJewe0S/mnc+C5Th4gDamPm0l29JRyVMnL6fsfTG+EfjUwOmm/DNgRHD8n5fjKXng+6UI87myp3cPqbfWs2baTtdU72b5zLx/FmmntpGtdmJdDyaACigcXMHxgAeUlAykeVEDxwMSx4oEFDB9UQMmgxPdhA/LJy9XkL5H+pjdC/3lgtpk9Q+JN23p3f9/MXgT+NenN2wuBe3rh+STFX+obWb1tJ2uqd7Jm207erK5nd9BDH1SQy+SyYVww8fj2wC4elE/xoMIgyPMpGVTIgILcDL8KETkSugx9M3uaRI+91MyqSczIyQdw98eBZcDfApuBGPDl4FydmX0LWBU81Ly2N3Xl8O1ubObN6npWBwG/Zls9f9nVCCT+NDx55DHMnHoCp5UNY8roYYwbMZjcI/Ano4j0D+nM3rmui/MO3NbJuUXAosMrTZpa4rz1l12s2bYzMVRTvZN3a/a0T34oLxnImeOKOa1sGKeNHsYpJxxDUb567CLSuaNuaeWwcnci0VgQ8ImvDTt20dQaB6BkUAFTRg/j8tNO4LTRw5g8aijDBxVkuGoR6W8U+keB32+u5Y5n3qB2TxMAA/JzmVQ2lFnTy4Ne/FBGDRtw0OwPEZHuUuhn2KpIHTc9WUnZ8AHcfeEEThs9jPHHDtbMGBHpEwr9DFqzbSdf/vEqRg4t4qc3n8WIIYWZLklEspy6kxmyYccublz0GsMH5fPUzWcq8EXkiFDoZ8DmD3dzw7/9iYEFufz0prMYOXRApksSkZBQ6B9hkdoGvvCjP2FmPHXTmYwuHpjpkkQk
RBT6R9D2nXu5/ok/0dwa56mbzmTciMGZLklEQkZv5B4hH+xq5As/+iO7Gpt5+uazmHD8kEyXJCIhpJ7+ERDds4/rn/gTtbv38eTfTePUUUMzXZKIhJR6+n2sPtbMF//tNbbVxXjy76Zx+om6eZiIZI56+n1od2MzN/74Nd79cA8Lb6zgrHElmS5JREJOPf0+Emtq4SuLK1m3vZ4fXH86Mz4+ItMliYiop98XGptbuWXJ61RW1fHQ56dw4SnHZ7okERFAPf1e19QS57an/sx/b67lwasn85nTTsh0SSIi7dTT70UtrXHu+vfVLH/rQ7712VP5XMXori8SETmCFPq9JB53/nnpWn715vv8z0tP5oazxmS6JBGRgyj0e4G7M/e5dfz8je380wUf56azx2W6JBGRDqUV+mZ2sZltMrPNZjang/NjzGy5ma01s5VmVpZ07ttmti74+nxvFn80cHfmvbCBp1/byq3nnMTs8z6W6ZJERDrVZeibWS7wGHAJMBG4zswmpjRbACxx98nAPOD+4NpLgdOBKcCZwNfN7JjeKz/zFry0iR//PsKXp5fz9Ysm6O5WInJUS6enPw3Y7O5b3L0JeAaYmdJmIrA82F6RdH4i8Iq7t7h7A7AGuLjnZR8dvvebd3hsxbtcN+1E7r1sogJfRI566YT+KGBb0n51cCzZGuCqYPsKYIiZlQTHLzGzgWZWCpwLZMWUlid+t4UFL73NlVNHMf+zpyrwRaRfSCf0O0ozT9m/G5hhZm8AM4DtQIu7vwQsA/4APA28CrQc9ARmt5hZpZlV1tTUdKf+jPjJH6v4l19t5G8nHc93rp5MTo4CX0T6h3RCv5oDe+dlwI7kBu6+w92vdPepwNzgWH3wfb67T3H3C0j8Ankn9QncfaG7V7h7xYgRR/dyBUtfr+Z/PbeO8//qWB76/FTdwFxE+pV0EmsVMN7MxppZAXAt8HxyAzMrNbO2x7oHWBQczw2GeTCzycBk4KXeKv5I++WaHfzz0jV86mOlPHb96RTkKfBFpH/pchkGd28xs9nAi0AusMjd15vZPKDS3Z8HzgHuNzMHfgvcFlyeD/wuGO/eBXzR3Q8a3ukPVm76kLv+fTUVY4pZeONfU5Sfm+mSRES6zdxTh+czq6KiwisrKzNdxkFuXPQa79XuYdntZzOkKD/T5YiIHMDMXnf3iq7aaXwiTVXRBqaMHq7AF5F+TaGfhubWONUf7aW8ZGCmSxER6RGFfhq2f7SX1rgzpmRQpksREekRhX4a3os2AKinLyL9nkI/DVW1idBXT19E+juFfhoi0RiDCnIpHVyQ6VJERHpEoZ+GqmgD5aWDtL6OiPR7Cv00VEVjlGtoR0SygEK/Cy2tcbZ9FGOM3sQVkSyg0O/Cjp2NNLe6evoikhUU+l2IRNtm7qinLyL9n0K/C1Vtc/RL1dMXkf5Pod+FSDTGgPxcjh1SmOlSRER6TKHfhapoA2NKBmq6pohkBYV+FyKarikiWUShfwitcWdrNMaYUr2JKyLZQaF/CO/X76WpNa6evohkDYX+IVRFY4Cma4pI9lDoH0KkfUll9fRFJDukFfpmdrGZbTKzzWY2p4PzY8xsuZmtNbOVZlaWdO47ZrbezDaa2SPWj6bBVEVjFOblcPwxRZkuRUSkV3QZ+maWCzwGXAJMBK4zs4kpzRYAS9x9MjAPuD+49pPAdGAycCpwBjCj16rvY5HaxHTNnJx+83tKROSQ0unpTwM2u/sWd28CngFmprSZCCwPtlcknXegCCgACoF84IOeFn2kVEVjunGKiGSVdEJ/FLAtab86OJZsDXBVsH0FMMTMStz9VRK/BN4Pvl509409K/nIiMedqroG3SJRRLJKOqHf0diGp+zfDcwwszdIDN9sB1rM7GPAyUAZiV8U55nZpw96ArNbzKzSzCpramq69QL6yge7G2lsjqunLyJZJZ3QrwZGJ+2XATuSG7j7Dne/0t2nAnODY/Ukev1/dPc97r4H+DVw
VuoTuPtCd69w94oRI0Yc5kvpXZHaxHRNzdwRkWySTuivAsab2VgzKwCuBZ5PbmBmpWbW9lj3AIuC7a0k/gLIM7N8En8F9IvhnSotqSwiWajL0Hf3FmA28CKJwH7W3deb2Twzuzxodg6wyczeBo4D5gfHlwLvAm+SGPdf4+6/7N2X0Dci0RgFuTmcMGxApksREek1eek0cvdlwLKUY/cmbS8lEfCp17UCX+1hjRlRFW1gdPEAcjVdU0SyiD6R2wmtriki2Uih3wF3D9bRV+iLSHZR6HegZvc+Yk2tlGtJZRHJMgr9DkTaV9dUT19EsotCvwNtq2uOVeiLSJZR6HegKtpAXo5xwjCtriki2UWh34FINMbo4oHk5erHIyLZRanWgcTMHb2JKyLZR6Gfwt2J1GqOvohkJ4V+imhDE3v2tainLyJZSaGfokr3xRWRLKbQT9G+pHKpQl9Eso9CP0VVtIHcHGOUVtcUkSyk0E8RicYYNWwABXn60YhI9lGypYhouqaIZDGFfhJ3573aBr2JKyJZS6GfZGesmd2Nmq4pItlLoZ8koumaIpLlFPpJqqKariki2S2t0Dezi81sk5ltNrM5HZwfY2bLzWytma00s7Lg+Llmtjrpq9HMPtvbL6K3RKINmMHoYk3XFJHs1GXom1ku8BhwCTARuM7MJqY0WwAscffJwDzgfgB3X+HuU9x9CnAeEANe6sX6e1VVNMYJQwdQmJeb6VJERPpEOj39acBmd9/i7k3AM8DMlDYTgeXB9ooOzgNcDfza3WOHW2xfe6+2QbdIFJGslk7ojwK2Je1XB8eSrQGuCravAIaYWUlKm2uBpw+nyCNFN0MXkWyXTuhbB8c8Zf9uYIaZvQHMALYDLe0PYDYSmAS82OETmN1iZpVmVllTU5NW4b2tPtbMR7FmyjVdU0SyWDqhXw2MTtovA3YkN3D3He5+pbtPBeYGx+qTmlwD/Ie7N3f0BO6+0N0r3L1ixIgR3XoBvaWqTtM1RST7pRP6q4DxZjbWzApIDNM8n9zAzErNrO2x7gEWpTzGdRzlQzsRTdcUkRDoMvTdvQWYTWJoZiPwrLuvN7N5ZnZ50OwcYJOZvQ0cB8xvu97Mykn8pfBKr1bey6pqEz39E4s1vCMi2SsvnUbuvgxYlnLs3qTtpcDSTq6NcPAbv0edSDTGyKFFFOVruqaIZC99Ijeg1TVFJAwU+oGqqFbXFJHsp9AHdjc2U7unSXP0RSTrKfTZv9DaWH0aV0SynEKf/aGvnr6IZDuFPvvX0dcbuSKS7RT6QKS2gWOHFDKwIK0ZrCIi/ZZCn8TwjmbuiEgYKPTRHH0RCY/Qh36sqYUPd+/TmjsiEgqhD/32++JqeEdEQkChr5k7IhIioQ/9SPscfYW+iGQ/hX5tA6WDCxhSlJ/pUkRE+pxCX/fFFZEQCX3oV0VjGtoRkdAIdeg3Nrfyfn0jY9XTF5GQCHXob60L3sTVHH0RCYlQh34kuC9uuYZ3RCQk0gp9M7vYzDaZ2WYzm9PB+TFmttzM1prZSjMrSzp3opm9ZGYbzWxDcKP0o0L76prF6umLSDh0Gfpmlgs8BlwCTASuM7OJKc0WAEvcfTIwD7g/6dwS4EF3PxmYBnzYG4X3hkg0xvCB+QwdqOmaIhIO6fT0pwGb3X2LuzcBzwAzU9pMBJYH2yvazge/HPLc/WUAd9/j7rFeqbwXVGm6poiETDqhPwrYlrRfHRxLtga4Kti+AhhiZiXAx4GdZvZzM3vDzB4M/nI4gJndYmaVZlZZU1PT/VdxmCK1MY3ni0iopBP61sExT9m/G5hhZm8AM4DtQAuQB5wdnD8DGAfMOujB3Be6e4W7V4wYMSL96ntgX0srO+r3anVNEQmVdEK/GhidtF8G7Ehu4O473P1Kd58KzA2O1QfXvhEMDbUAzwGn90rlPbStbi/uWl1TRMIlndBfBYw3s7FmVgBcCzyf3MDMSs2s7bHuARYlXTvczNq67+cBG3peds9p
dU0RCaMuQz/ooc8GXgQ2As+6+3ozm2dmlwfNzgE2mdnbwHHA/ODaVhJDO8vN7E0SQ0U/6vVXcRjea5+jr56+iIRHWncCd/dlwLKUY/cmbS8FlnZy7cvA5B7U2CeqojGOKcpjmKZrikiIhPYTuZFoA+WlgzDr6H1qEZHsFNrQT6yuqaEdEQmXUIZ+U0uc6o9ijNWbuCISMqEM/e079xJ31NMXkdAJZei3r65Zqp6+iIRLOEO/fY6+evoiEi6hDP2qaIzBhXmUDCrIdCkiIkdUKEM/cTP0gZquKSKhE8rQr4rGtNCaiIRS6EK/pTXOtjotqSwi4RS60N+xs5GWuOtNXBEJpdCF/ntRLbQmIuEVutCvag99De+ISPiELvQjtTEG5OcyYkhhpksRETniQhf6VZquKSIhFrrQj0QbGKvpmiISUqEK/da4s61ur2buiEhohSr036/fS1NrXG/iikhopRX6ZnaxmW0ys81mNqeD82PMbLmZrTWzlWZWlnSu1cxWB1/Pp157JEVqY4AWWhOR8OryHrlmlgs8BlwAVAOrzOx5d9+Q1GwBsMTdnzSz84D7gRuCc3vdfUov131Y2lbX1JLKIhJW6fT0pwGb3X2LuzcBzwAzU9pMBJYH2ys6OH9UqIo2UJiXw3FDijJdiohIRqQT+qOAbUn71cGxZGuAq4LtK4AhZlYS7BeZWaWZ/dHMPtujansoEo0xpmQgOTmariki4ZRO6HeUkJ6yfzcww8zeAGYA24GW4NyJ7l4BfAF4yMxOOugJzG4JfjFU1tTUpF99N1VFG7T8goiEWjqhXw2MTtovA3YkN3D3He5+pbtPBeYGx+rbzgXftwArgampT+DuC929wt0rRowYcTivo0vxuGtJZREJvXRCfxUw3szGmlkBcC1wwCwcMys1s7bHugdYFBwfbmaFbW2A6UDyG8BHzF92NbKvJc4YTdcUkRDrMvTdvQWYDbwIbASedff1ZjbPzC4Pmp0DbDKzt4HjgPnB8ZOBSjNbQ+IN3gdSZv0cMRGtriki0vWUTQB3XwYsSzl2b9L2UmBpB9f9AZjUwxp7RVW0bY6+evoiEl6h+URuJNpAQW4OI4cOyHQpIiIZE5rQr6qNcWLJQHI1XVNEQiw0oR+JNmjNHREJvVCEvntiuqbW3BGRsAtF6H+4ex97m1vV0xeR0AtF6EdqE9M11dMXkbALRei3TdfUHH0RCbtQhH4k2kBejnHCMK2uKSLhForQr4rGOLF4IHm5oXi5IiKdCkUKRqIN+iSuiAghCH13J1LboDdxRUQIQejX7mmioUnTNUVEIAShXxWsrjlG6+iLiGR/6Ec0XVNEpF3Wh35VtIHcHKNsuFbXFBHJ+tCPRGOUDR9AvqZriohkf+hXRTVzR0SkTVaHvrvzXq2WVBYRaZPVof9RrJndjS3q6YuIBNIKfTO72Mw2mdlmM5vTwfkxZrbczNaa2UozK0s5f4yZbTez7/VW4enYfzN09fRFRCCN0DezXOAx4BJgInCdmU1MabYAWOLuk4F5wP0p578FvNLzcrunfY6+evoiIkB6Pf1pwGZ33+LuTcAzwMyUNhOB5cH2iuTzZvbXwHHASz0vt3sitTFyDEYXa7qmiAikF/qjgG1J+9XBsWRrgKuC7SuAIWZWYmY5wP8Gvt7TQg9HVbSBE4YNoDAvNxNPLyJy1Ekn9K2DY56yfzcww8zeAGYA24EW4FZgmbtv4xDM7BYzqzSzypqamjRKSk8kGtMncUVEkuSl0aYaGJ20XwbsSG7g7juAKwHMbDBwlbvXm9kngLPN7FZgMFBgZnvcfU7K9QuBhQAVFRWpv1AOWyTawKWTRvbWw4mI9HvphP4qYLyZjSXRg78W+EJyAzMrBercPQ7cAywCcPfrk9rMAipSA7+v7Iw1sTPWrJ6+iEiSLod33L0FmA28CGwEnnX39WY2z8wuD5qdA2wys7dJvGk7v4/qTVvbfXF18xQRkf3S6enj7suAZSnH7k3aXgos
7eIxFgOLu13hYWqfo68llUVE2mXtJ3KrojHM4MRi9fRFRNpkbehHog2MPKaIonxN1xQRaZO9oa/74oqIHCRrQ78qGqO8VEM7IiLJsjL0dzU2E21oUk9fRCRFVob+1vb74qqnLyKSLCtDP6LVNUVEOpSVoa8PZomIdCwrQz9S28BxxxQysCCtz56JiIRGdoa+boYuItKhLA39mN7EFRHpQNaFfsO+Fmp271NPX0SkA1k36F3VPl1ToS/Sprm5merqahobGzNdivRQUVERZWVl5OfnH9b1WRj6bdM1Nbwj0qa6upohQ4ZQXl6OWUc3w5P+wN2JRqNUV1czduzYw3qMrBveibT19LWkski7xsZGSkpKFPj9nJlRUlLSo7/Ysi/0axsoHVzI4MKs+yNGpEcU+Nmhp/8dsy/0ow2auSMi0omsC/2qaEwzd0Qko3bu3Mn3v//9Hj3GrFmzWLr0kDckPCxZFfp7m1r5y65G9fRFpNtaW1t77bF6I/T7SloD32Z2MfAwkAs84e4PpJwfAywCRgB1wBfdvTo4/vPgunzgUXd/vBfrP8DWumDNHb2JK9Kpb/5yPRt27OrVx5x4wjF84zOnHLLNkiVLWLBgAWbG5MmTyc3N5bLLLuPqq68GYPDgwezZs4eVK1fyjW98g+OOO47Vq1dz5ZVXMmnSJB5++GH27t3Lc889x0knndThc8yaNYuioiLWr1/PBx98wHe/+10uu+wyWltbmTNnDitXrmTfvn3cdtttfPWrX2XlypV885vfZOTIkaxevZoNGzYcVOdPfvITampq+Pu//3u2bt0KwEMPPcT06dO577772Lp1K1u2bGHr1q3ceeed3H777cyZM4d3332XKVOmcMEFF3DppZeyYMECXnjhBQBmz55NRUUFs2bNYt68efzyl79k7969fPKTn+SHP/xhn77/0mXom1ku8BhwAVANrDKz5919Q1KzBcASd3/SzM4D7gduAN4HPunu+8xsMLAuuHZHr78S9q+uOVbDOyJHlfXr1zN//nx+//vfU1paSl1dHf/4j//Yafs1a9awceNGiouLGTduHDfddBOvvfYaDz/8MI8++igPPfRQp9dGIhFeeeUV3n33Xc4991w2b97MkiVLGDp0KKtWrWLfvn1Mnz6dCy+8EIDXXnuNdevWMXbs2A7rBLjjjju46667+NSnPsXWrVu56KKL2LhxIwBvvfUWK1asYPfu3UyYMIGvfe1rPPDAA6xbt47Vq1cDsHLlyk7rnT17Nvfeey8AN9xwAy+88AKf+cxnuvXz7Y50evrTgM3uvgXAzJ4BZgLJoT8RuCvYXgE8B+DuTUltCunj4aS2OfonanhHpFNd9cj7wm9+8xuuvvpqSktLASguLj5k+zPOOIORI0cCcNJJJ7UH9KRJk1ixYsUhr73mmmvIyclh/PjxjBs3jrfeeouXXnqJtWvXto+R19fX884771BQUMC0adPa57x3Vud//dd/sWHD/sjbtWsXu3fvBuDSSy+lsLCQwsJCjj32WD744INu/WxWrFjBd77zHWKxGHV1dZxyyikZD/1RwLak/WrgzJQ2a4CrSAwBXQEMMbMSd4+a2WjgV8DHgK/3VS8f4L3aGMWDChg64PA+qSYifcPdDxqyyMvLIx6Pt59vatrfRywsLGzfzsnJad/PycmhpaXlkM+V+jxmhrvz6KOPctFFFx1wbuXKlQwatH9koKM6AeLxOK+++ioDBgw46Fxyrbm5uR3Wl/xagfZ59o2Njdx6661UVlYyevRo7rvvvj7/1HQ6Pe+OBpc8Zf9uYIaZvQHMALYDLQDuvs3dJ5MI/S+Z2XEHPYHZLWZWaWaVNTU13XoByaqiDfokrshR6Pzzz+fZZ58lGo0CUFdXR3l5Oa+//joAv/jFL2hubu6V5/rZz35GPB7n3XffZcuWLUyYMIGLLrqIH/zgB+3P8fbbb9PQ0JBWnQAXXngh3/ve99rbtQ3bdGbIkCHtfwkAjBkzhg0bNrBv
3z7q6+tZvnw5sD/8S0tL2bNnT5/M1kmVTk+/GhidtF8GHNBbD3rvVwIEY/dXuXt9ahszWw+cDSxNObcQWAhQUVGR+gslbVXRGNPGHvrPRhE58k455RTmzp3LjBkzyM3NZerUqXz7299m5syZTJs2jfPPP/+AHndPTJgwgRkzZvDBBx/w+OOPU1RUxE033UQkEuH000/H3RkxYgTPPfdcWnUuXryYRx55hNtuu43JkyfT0tLCpz/9aR5/vPM5KSUlJUyfPp1TTz2VSy65hAcffJBrrrmGyZMnM378eKZOnQrAsGHDuPnmm5k0aRLl5eWcccYZvfIzOBRzP3TGmlke8DZwPoke/CrgC+6+PqlNKVDn7nEzmw+0uvu9ZlYGRN19r5kNB/5E4hfCm509X0VFhVdWVnb7hTQ2t3Lyvf/JHeeP586/+Xi3rxfJZhs3buTkk0/OdBl9btasWQfMCMpWHf33NLPX3b2iq2u7HN5x9xZgNvAisBF41t3Xm9k8M7s8aHYOsMnM3gaOA+YHx08G/mRma4BXgAWHCvye2N3YwlljS5g48pi+eHgRkayQ1jx9d18GLEs5dm/S9lJShmyC4y8Dk3tYY1pGDCnk6VvOOhJPJSIZNn/+fH72s58dcOxzn/scixcvzkxB/YhWJRORfmfu3LnMnTs302X0S1m1DIOIdK6r9++kf+jpf0eFvkgIFBUVEY1GFfz9XNtNVIqKig77MTS8IxICZWVlVFdX05PPwcjRoe12iYdLoS8SAvn5+Yd9ez3JLhreEREJEYW+iEiIKPRFREKky2UYjjQzqwGqMl1HJ0qB2kwXcZhUe2b019r7a90Q3trHuPuIrhoddaF/NDOzynTWtjgaqfbM6K+199e6QbV3RcM7IiIhotAXEQkRhX73LMx0AT2g2jOjv9beX+sG1X5IGtMXEQkR9fRFREJEoZ8GMxttZivMbKOZrTezOzJdU3eYWa6ZvWFmL2S6lu4ws2FmttTM3gp+9p/IdE3pMrO7gv9X1pnZ02Z2+Ctk9TEzW2RmH5rZuqRjxWb2spm9E3wfnskaO9NJ7Q8G/8+sNbP/MLNhmayxMx3VnnTubjPz4K6EvUqhn54W4J/c/WTgLOA2M5uY4Zq64w4Sdz3rbx4YwpQvAAAFNklEQVQG/tPd/wo4jX7yGsxsFHA7UOHupwK5wLWZreqQFgMXpxybAyx39/HA8mD/aLSYg2t/GTjV3SeTuNXrPUe6qDQt5uDaMbPRwAXA1r54UoV+Gtz9fXf/c7C9m0T4jMpsVekJ7lN8KfBEpmvpDjM7Bvg08G8A7t7k7jszW1W35AEDgntMDwR2ZLieTrn7b4G6lMMzgSeD7SeBzx7RotLUUe3u/lJwm1eAPwKHvyRlH+rk5w7wf4B/BvrkDVeFfjeZWTkwlcRN3vuDh0j8DxTPdCHdNA6oAX4cDE09YWaDMl1UOtx9O7CARE/tfaDe3V/KbFXddpy7vw+JTg9wbIbrOVx/B/w600WkK7jv+HZ3X9NXz6HQ7wYzGwz8P+BOd9+V6Xq6YmaXAR+6++uZruUw5AGnAz9w96lAA0fvEMMBgvHvmcBY4ARgkJl9MbNVhY+ZzSUxNPtUpmtJh5kNBOYC93bVticU+mkys3wSgf+Uu/880/WkaTpwuZlFgGeA88zs/2a2pLRVA9Xu3vYX1VISvwT6g78B3nP3GndvBn4OfDLDNXXXB2Y2EiD4/mGG6+kWM/sScBlwvfefeeknkegorAn+zZYBfzaz43vzSRT6aTAzIzG2vNHdv5vpetLl7ve4e5m7l5N4I/E37t4vepzu/hdgm5lNCA6dD2zIYEndsRU4y8wGBv/vnE8/eRM6yfPAl4LtLwG/yGAt3WJmFwP/A7jc3WOZridd7v6mux/r7uXBv9lq4PTg30KvUeinZzpwA4me8urg628zXVQI/APwlJmtBaYA/5rhetIS/HWyFPgz8CaJf2dH7adEzexp4FVggplVm9lXgAeAC8zs
HRIzSR7IZI2d6aT27wFDgJeDf6uPZ7TITnRSe98/b//5y0dERHpKPX0RkRBR6IuIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIgo9CXrmNkJZrY003UcDjO7M/g4vkif0Dx9kaNI8PH7CnevzXQtkp3U05d+zcy+bWa3Ju3fZ2b/1HZjiuAGMg+a2argphpfDY5/P1jRkOBGG4uC7a+Y2b8c4vluDB5njZn9JDg2xsyWB8eXm9mJwfHFZnZ10rV7gu/nmNnKpBvEPGUJt5NYoG2Fma3o7Z+VCCj0pf97Bvh80v41wKqk/a+QWNr4DOAM4GYzGwv8Fjg7aDMKaLspzqeA33X0RGZ2ColVEM9z99NI3JwGEh/7XxLctOMp4JE06p4K3Bk87zhgurs/QmLd/XPd/dw0HkOk2xT60q+5+xvAscE4/mnARxx4x6ELgRvNbDWJeyCUAONJBPvZwR3QNrB/VclPAH/o5OnOA5a2Db24e9sNMD4B/DTY/gmJXxxdec3dq909DqwGytN5vSI9lZfpAkR6wVLgauB4Ej3/ZAb8g7u/mHpRsO79xSR6/cUk/krYE9wdrSNGenczamvTQtCxClbbLEhqsy9puxX9W5QjRD19yQbPkFg6+moSvwCSvQh8LbgfAmb28aQ7cL1KYojltyR6/nfTydBOYDlwjZmVBI9VHBz/A/vvgXs98N/BdgT462B7JpCfxmvZTWKFSJE+odCXfs/d15MIyu1tt/hL8gSJ4Zs/B2/u/pD9verfAXnuvpnEMsjFHCL0g+eZD7xiZmuAtnsr3A58OVgC+gb2j/X/CJhhZq8BZ5K4+1dXFgK/1hu50lc0ZVNEJETU0xcRCRG9eSSSIhizX97BqfPdPXqk6xHpTRreEREJEQ3viIiEiEJfRCREFPoiIiGi0BcRCRGFvohIiPx/JrOisdnGLTIAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "page_views_by_user.iloc[:20, :].plot.line(x = \"view_count\", y = \"cum_percentual\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>view_count</th>\n",
       "      <th>num_users</th>\n",
       "      <th>cum_percentual</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>4</td>\n",
       "      <td>16394</td>\n",
       "      <td>0.999080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>6165</td>\n",
       "      <td>0.999750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>1453</td>\n",
       "      <td>0.999908</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7</td>\n",
       "      <td>479</td>\n",
       "      <td>0.999960</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>8</td>\n",
       "      <td>247</td>\n",
       "      <td>0.999987</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9</td>\n",
       "      <td>90</td>\n",
       "      <td>0.999997</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10</td>\n",
       "      <td>22</td>\n",
       "      <td>0.999999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    view_count  num_users  cum_percentual\n",
       "8            4      16394        0.999080\n",
       "4            5       6165        0.999750\n",
       "2            6       1453        0.999908\n",
       "3            7        479        0.999960\n",
       "5            8        247        0.999987\n",
       "0            9         90        0.999997\n",
       "12          10         22        0.999999\n",
       "6           11          6        1.000000\n",
       "10          12          2        1.000000\n",
       "7           14          1        1.000000"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page_views_by_user.tail(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Page views by platform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>platform</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4403345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4678799</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>917855</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            count\n",
       "platform         \n",
       "1         4403345\n",
       "2         4678799\n",
       "3          917855"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page_views_by_platform = page_views.groupBy(\"platform\")\\\n",
    ".count().toPandas().set_index(\"platform\").sort_index()\n",
    "page_views_by_platform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,1,'Page views by platform')"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAecAAAHUCAYAAAAA4+BXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzs3XeYlNXZBvD7mdnOwsDCwi51lL70IqJ00UQzChZsRINKjJrERKOfGTUJq4lmjMEYNcUWu6jEEnQUK1VQBCxLF2EWlmU729uU8/3xzsLuumWWLWfK/buuuWDeOXPeewfdZ855yxGlFIiIiCh4mHQHICIiooZYnImIiIIMizMREVGQYXEmIiIKMizOREREQYbFmYiIKMiwOBN1EhGZJSJ7Ne37GhHZ2AX7sYqIEpGok3y/iMgzInJMRLZ0dD6iUMXiTCFLRFwiUiUi5SKS6/8ln6g7Vx2l1Aal1EjdOYKFiMwVkaxGm2cCOAfAQKXUNA2xiIISizOFuguUUokAJgM4DcDvNOehthkCwKWUqmjrG092tE4UClicKSwopY4AeA/AWAAQkWtFZLeIlInIARG5oX57EblDRI6KSLaI/NQ/NTvM/1qsiPxVRA75R+T/FpH4xvv0tysWkbH1tiX7R/N9G48URaS/iLwuIvkiclBEfuXfHud/Tx//89+JiEdEevif/0lEHvb//Ucissv/cx0Rkdtb+FhERB4VkRIR2SMi8/0bLxWRbY0a3iYibzXTyVoR+bOIbPH39T8RSWqmbZOfu4h08//79PfPdJSLyN0AngJwhv/5Pf6214vIfhEpEpFVItK/Xv9KRH4hIt8C+Lbetp+LyLf+/f5RRIaKyGYRKRWR10QkpoXPiSjosDhTWBCRQQB+BOBL/6Y8AOcD6AHgWgB/E5HJ/rbnAvgNgLMBDAMwp1F3DwAYAWCi//UBAP7QeJ9KqRoAbwC4st7mywCsU0rlNcpnAvA2gK/9/c0HcIuI/FApVQ3gi3o5ZgPIBDCj3vN1/r8/DeAGpVR3GF9EPmnhYzkdwAEAfQAsA/CGv6iuAnCKiIyu1/YqAC+00NdPAFwHoD8AD4BHmmnX5OfuHxmfByBbKZXof9wH4EYAm/3Pl4nIWQD+DONzTPV/Dq802seF/p8trd62cwFMATAdwB0AngDwYwCDYHxOV4IohLA4U6h7S0SKAWyEUcDuBwCllFMp9Z0yrAPwAYBZ/vdcBuAZpdROpVQlgHvqOhMRAXA9gFuVUkVKqTJ/n1c0s/+X0fAX/2L/tsZOA5CslLpXKVWrlDoA4Ml6/a4DMMc/VTseRvGbIyJx/vdu8LdzA0gTkR5KqWNKqe0tfDZ5AB5WSrmVUq8C2AvA5v9S8SqMggwRGQPACuCdFvp6QSm1w19kfw/gMhExN27UyuceiB8D+I9Sars/550wRtbWem3+7P+3qaq37QGlVKlSaieAHQA+UEodUEqVwBixT2pDBiLtWJwp1F2olOqplBqilPp53S9sETlPRD7zT40WwxhV9/G/pz+Aw/X6qP/3ZAAJALb5p6yLAaz2b2/KJwDiReR0ERkCY7T9ZhPthsCY0i2u1+9dAPr5X18HYC6MY+cZAD6EMZKeDmC/UqrA3+4S/8+SKSLrROSMFj6bI6rhyjaZ/p8dAJ4DsNj/ZeRqAK/5i2Fz6n9GmQCiceLzPK6Vzz0Q/f39AwCUUuUACmHMNjSVpU5uvb9XNfE8aE4UJAoET6igsCMisQBehzEV+z+llNt/PFX8TY4CGFjvLYPq/b0Axi/zMf7j2C1SSvlE5DUYo+dcAO/4R9uNHQZwUCk1vJmuNgEYCeAiGNPiu0RkMAAbTkxpQyn1BYCFIhIN4JcAXmuUv74BIiL1CvRgGFPaUEp9JiK1MEa1i/2PltTfx2AYI/iC+tsD+NwDWQIvG8YXmbo+uwHoDaD+vwWX0qOwx5EzhaMYALEA8gF4
ROQ8AD+o9/prAK4VkdEikoB6x5OVUj4Y081/E5G+ACAiA0Tkhy3s72UAl8OYkm1qShsAtgAoFZHfiki8iJhFZKyInObfbyWAbQB+gRPFeBOAG+qei0iMiPxYRCxKKTeAUgDeFnL1BfArEYkWkUsBjAbwbr3XnwfwGACPUqq1a6KvEpE0/+d1L4D/KqUa77u1zz0XQG8RsbSwn5dh/NtM9Bf7+wF8rpRytZKPKKywOFPY8Y9cfwWjCB+DMSpcVe/192Ac010DYD+Azf6X6qZ1f+vf/pmIlAL4CMaotrn9fQ6gAsaU7HvNtPECuADGtPdBGKPOpwDUL1TrYEwXb6n3vDuA9fXaXA3A5c91I/zHjZvxOYDh/n3dB2CRUqqw3usvwDhZqqUTweq3fRZADoA4GJ9v45+xtc99D4AVAA74p/b7N9HHxzCOab8OY4ZjKJo/3k8UtqThISmiyOM/a3kHgFillEd3nq4ixuVheQAmK6W+baHdWgAvKqWe6qpsRJGOI2eKSCJykX+auBeMS6fejqTC7HcTgC9aKsxEpAdPCKNIdQOMaVovjOnjn2tN08VExAXjRK0LNUchoiZwWpuIiCjIcFqbiIgoyLA4ExERBRkWZyIioiDD4kxERBRkWJyJiIiCDIszERFRkOF1zkREEW7btm19o6KinoJxO1cO2lrmA7DD4/H8dMqUKXmttj5JLM5ERBEuKirqqZSUlNHJycnHTCYTb37RAp/PJ/n5+Wk5OTlPAVjQWfvhNyQiIhqbnJxcysLcOpPJpJKTk0tgzDJ03n46s3MiIgoJJhbmwPk/q06tnyzOREREQYbHnImIqAGr3TmlI/tzOWzbWmtjNpunDB8+vMrj8YjZbFZXXnll4e9///tcs9nc5v0lJCRMqqys/LL+tr1798asWbMm8cYbbyxqc4cacORMRETaxcbG+vbs2bNr//79Oz/55JN9H3zwgeX222/v31H9f/vtt7GvvvpqUkf119lYnImIKKgMGDDA89RTT7meeeaZvj6fDx6PBzfccMPAsWPHjh4xYkTagw8+2AcAMjMzo6dOnTpy1KhRacOHDx+zevXqxPr9HD16NGrixImjXnnlFcvdd989YOvWrYmjRo1Ku+eee/pWVlbKokWLrCNGjEgbPXp02ttvv90dAB555JHe8+fPHzpr1qzhVqt17G233Zaq4zPgtDYREQWdtLS0Wp/PhyNHjkS9+uqrPS0Wi3fHjh27q6qq5LTTTht1wQUXlK5YsaLX/PnzSx544IEcj8eDsrKy4wPOw4cPR9lstmH33HNP9kUXXVSamJjoW758eb81a9bsB4Bly5b1A4B9+/bt+vLLL+N+9KMfDf/uu+92AMA333zTLSMjY2diYqJv0qRJaQsXLiyZPXt2ZVf+/CzOREQUlJQyTiD/6KOPeuzZsydh1apVvQCgrKzMvGvXrrjp06dX3HDDDVa3221atGjRsTPPPLMKADwej5x11lkjH3744UybzVbeVN+bNm1KvPnmm/MAYNKkSdX9+/evzcjIiAOAmTNnlqakpHgBwGazHVu7dm1iVxdnTmsTEVHQ2bVrV4zZbMaAAQM8SilZvnz5oT179uzas2fPriNHjmRcfPHFpeedd175+vXr9w4YMKD2mmuuOeWxxx7rDQBms1mNGzeu4r333rM0139d4W+KiLT4vCuwOBMRUVDJzs6Ouv7664dce+21eSaTCeecc07Jv/71r+SamhoBgG+++Sa2tLTUtG/fvpgBAwa4b7vttoKrrrqqYPv27QmAUUxfe+011759++LuuuuuFACwWCze8vLy46d+z5w5s/zFF19Mquvv6NGjMePHj68GgI0bN/bIzc01l5eXy7vvvttzzpw5TY6+OxOntYmIqIFALn3qaDU1NaZRo0al1V1KdfnllxcuW7YsFwBuvfXWApfLFTtu3LjRSilJSkpyv/vuu9+9//773R955JGUqKgolZCQ4H3ppZcO1vUXFRWFVatWHTj7
7LOHORwO76233loQFRWlRo4cmbZ48eKCO+64I+/qq68eMmLEiDSz2YzHH3/cFR8frwBg6tSp5ZdffvkpLpcr7pJLLins6iltAJCWhvZERBT+vv76a9eECRMKdOcIBo888kjvrVu3dnv++ecPtdTu66+/7jNhwgRrZ+XgtDYREVGQ4bQ2ERGR369+9atCAIW6c3DkTEREFGRYnImIiIIMizMRdQkRSRcR5X/4ROSYiHwhIveJSEon7E+JyC9beH2EP1PPjt43UXuxOBNRVyoBcAaAMwFcAeANAFcDyBCRDl0JKQAjACwDwOJMQYcnhBFRV/IopT6r9/x9EfkXgPUAXhWRkUopr6ZsVCfd0rFflNJLWr1uWkSmLFy4sOitt946CAButxt9+/adMHHixIq6+2E35Te/+U3/xMRE77333ptbf7vL5Yq+8cYbB61evfrAO++8073+fbVDAUfORKSVUqoYwB0AhgI4BwBEJE5E/iIih0WkRkS+FpEf1X+fiCwQkW0iUuGfIv9cROY0tx8RGSsiOSLygojMB/C2/6WD/ilwV722E0XkYxGp9Pf9koj0q/e61f+exf7+ykQkT0SWddwnE1ni4+N9e/fujS8vLxcAePPNN3v069fPfbL9Wa1W9+rVqw90XMKuxeJMRMFgDQAPgOn+5/8FcA2A+wFcAOALAKtEZCIAiMhQf5tP/K//GMA7AJpcr1dEJgFYC6MgL/H3d7v/5YthTLVf5G+b7G+bAGAxgJsBzAHwoYjENOr6QQCVABYBeBLAMhH5xcl8AATMnz+/ZOXKlT0BYMWKFUmXXHJJUd1rubm55rPPPnvoiBEj0iZMmDDq888/j6977ZtvvkmYPn36iCFDhoxdvnx5HwDYu3dvzPDhw8c03kdpaanp0ksvtY4dO3b06NGj01588cWgPKzBaW0i0k4pVSMiBQD6+Ue1NgBzlVLr/E0+EJERAO4GcCmASQDKlFL/V6+bd5vqW0ROB7AawIsAfqWM2yKWishef5MvlVKuem+5zf/nD5VSpf4+9gH4HMAlAFbUa7tTKXWD/+/vi0hfAHeJyL+UUr42fgwR7+qrry5atmxZ6uWXX168e/fuhKVLlxZu2rQpEQDuuOOO/hMmTKj86KOPvlu1alX3JUuWnLJnz55dALB79+74bdu27S4rKzNPmjQp7ZJLLilpbh933XVX6rx580pXrlzpKigoME+dOnX0ggULSnv06BFU/14cORNRsKhb+udsADkAPhWRqLoHgI8BTPW3yQBgEZHnROQHItKtmT5nAPgQwBNKqZtVYPcrngbgg7rCDABKqS0AXABmNmr7ZqPnbwDoD2BgAPuhRk4//fSqrKys2CeffDLp7LPPblBgt2zZ0n3p0qWFALBgwYKy4uLiqMLCQjMAnHfeecWJiYkqNTXVc8YZZ5Ru2LChuf8esHbt2h5/+9vfUkeNGpU2c+bMkTU1NbJ///7GMyLaceRMRNqJSByA3gByAQwAkAKgqeONXgBQSu0VkYUA7DBGzG4ReRPAr5VS+fXa/wDG77nn2xAnFcDOJrbn4vvT5nnNPE8F0OK9malp5557bvGyZcsGffDBB3vz8vKO16imvleJiPL/2Xh7s/0rpfDf//53/4QJE2o6LHQn4MiZiILBPBhFdDOAIgBHAJzWxKPumDSUUk6l1CwYRX0pjBH3o436/ROM49kf+o9TB+IogL5NbO/nz1Zf43Z1z48GuC9q5Kabbiq47bbbsqdNm1ZVf/v06dPLnnnmmd4A8M4773Tv1auXJykpyQcA7733Xs/KykrJyckxf/bZZ91nzpxZ0Vz/8+bNK12+fHk/n8+Yxf7000/jm2urE0fORKSV/yYgDwDYD+AjAArGcd9ypdSe1t6vlCoB8LL/TO0zGr3shnGy1rsAPhKRmUqpI/7Xav1/xjV6z+cAbhKR7kqpMn/G0wBYAWxs1PYiAP+q9/xiGIU5q7XcQS2AS586y9ChQ92///3vG89I4IEH
HshevHixdcSIEWnx8fG+Z5999vjykJMmTaqYP3/+8Ozs7Jjbb7/9qNVqde/du7fJqWqHw5H9s5/9bPCoUaPSlFIycODAmmC8xIpLRhJRlxCRdAC3ADjXv6k7gCkAboJxZvS5SqltYsxJvgNgPIyivRNADwATAcQppe4UkRtgFOLVALIBDIdx5vTzSqlb/PtTAG5WSj0mIokwCr8FwGylVL6IDIFxHPlhAK8AqFRKZfjP1v4WwC7//hMBOAAcAzBVKVUrIlYAB/37fgfA6wBmA7gLxtR64xF8UOOSkW3X2UtGcuRMRF3JAmPqWgEohTFafhHAo0qpHABQSikRuRhGobsFwGAY08lf4cS09TcAFgB4CMZx4KMwLmX6Q1M7VUqVi8h5MKa43xeReUqpTBG5HcCvYFwulQXA6i/c8wAsh3Fmdi2MkfetSqnaRl3fAeB8GMW5GsAfATx28h8PkYEjZyKiNqo3cr5AKfWO3jTtx5Fz23X2yJknhBEREQUZFmciIqIgw2PORERt5L+jWPMX0xK1E0fOREREQYYjZyIiamDcc+M6dMnIjCUZLV43nZOTY547d+5IACgoKIg2mUwqKSnJAwBfffXV7ri4uAZnLrvdbiQlJU0sKyv7qnFfCxcuPGXRokXHrr766uLm9vfwww/3vvjii0sGDx7sObmfqPOxOBMRkVYpKSneukUsmlufuSO98MILfaZNm1YZzMWZ09pERBS0zjrrrGFjxowZPWzYsDEPPfRQn/qvLV26dFBaWtroM888c3hOTo658XvXrVuXcNppp40cM2bM6NmzZw8/fPhw1JNPPtlr9+7dCYsXLx46atSotOrq6qA8d4AjZ6IQZLU7Y2DcVcsHYx1kLwCPy2Hzag1G1MFWrFhxsF+/ft6ysjLTxIkTR1999dXHevbs6S0vLzdPnz69/Omnnz58yy239L/rrrv6/+c//zlc976qqiq55ZZbBr/77rv7U1NTPf/617+S7rjjjgErVqzI/Pe//9330UcfPXTmmWdWtbRvnViciTSy2p11qzH1hnGnq0AfTS6JZ7U7AaNQ1z08zfxZCyAfxp21mnpkA8h3OWy8SxFpdf/99/dbvXp1TwDIzc2N2b17d+wZZ5xRaTab1XXXXXcMAK677rrCxYsXn1r/fV9++WXc/v374+bNmzcCAHw+H1JSUppa6SwosTgTdTKr3RkNYBiAkQBG1XuMBNCzE3Zp9j9aM6yV1z1WuzMXTRfv7wDsdDlsXH2JOs1bb73VfdOmTd23bdu2OzExUU2ZMmVkVVWVCWh5WUjAWBpyxIgRVdu2bdvbJWE7GIszUQex2p290LD41j1ORWj+vxYFY23lAc01sNqdRTAWptgJYEfdw+WwFXZJQgprxcXF5p49e3oSExPV1q1b4zIyMo7PGHk8Hnn++ed7XXvttceeffbZ3qeffnp5/fdOnjy5Ojc3N2bNmjUJ8+bNq6yurpYdO3bETp06tbpbt26+0tLSQL7AahOKvzCItLPanakA5gCYCWP1pFEAkrWG0iMJwCz/4zir3XkYwDb/YyuAbS6HLb/r49HJaO3Sp65y2WWXlTz11FPJI0eOTBs2bFj1+PHjj6/TnJiY6N22bVvCgw8+mNKzZ0/vG2+88V3998bHx6tXXnnlu1//+teDysvLzV6vV375y1/mTJ06tfonP/lJwY033miNi4vzNXWpVjDgwhdEAbDanafAWBKw7tHalDB9X13B/gzAhwC+5DHt4MCFL9qOS0YSaWC1O0fBKMJzYIwKB+lNFBYG+R8XwlgfOd9qd34E4H0AH/D4NdEJLM4U8ax2pwnAOBiFeDaMYtxXa6jIkAzgSv8DVrtzB4AP/I/1LoctaC9zIepsLM4UkfwFeTaAywBcDKCf3kQEYKz/8RsA1Va7cwP8xdrlsH2jNVn48/l8PjGZTDzMEACfzycw7jHQaXjMmSKGvyDPwomCnKI3EbXBURjHqVcBeMflsNVozhNWvv7661Up
KSlpycnJJSzQLfP5fJKfn2/JycnZNWHChAWdtR8WZwpr9QrypQAuAQtyODgG4DUAz7sctk26w4SDbdu29Y2KinoKxswFb+vcMh+AHR6P56dTpkzJ66ydsDhT2PEX5Jk4MUJO1ZuIOtF+AM8DeMHlsLk0ZyHqMCzOFDasdudMAJfDGCGzIEcWBWADjEK90uWwlWrOQ9QuLM4U0qx2ZzcAPwFwM4DRmuNQcKgC8D8YhfoDLgZCoYjFmUKS1e60AvglgKXonPtTU3g4CuBlAM+6HLYdusMQBYrFmUKK1e6cC+DXAC5AYIs7ENX5EIDD5bB9ojsIUWtYnCno+ZdV/DGAX8G4jzVRe3wB4AEAb7octk69VpXoZLE4U9Cy2p0DAPwCwPUA+miOQ+FnL4AHYZzpXas7DFF9LM4UdKx255kwpq4vBu9iR50vG8DfADzuctjKdIchAlicKYhY7c4ZAO6DcY9roq5WDOCfAP7uctg67eYSRIFgcSbtrHbnZAB/AnCe7ixEMC7FegbAX10O20HdYSgysTiTNla7Mw3AvTCmr0VzHKLGvDAuw7rb5bAd1h2GIguLM3U5q905BMAfYZyBzfv4UrCrBPAXAH/hMpbUVVicqctY7U4LgLtgnOwVqzkOUVsdBvBbl8O2QncQCn8sztTprHZnFICbAPwBvCSKQt+nAG5xOWxbdQeh8MXiTJ3KandeCOOGDyN0ZyHqQArAcwDucjlsR3WHofDD4kydwmp3jgDwOIC5mqMQdaZyAPcDeMjlsNXoDkPhg8WZOpTV7jQDuA3APQDiNMch6ioHAdzuctje0B2EwgOLM3UYq905Fsb1oVN1ZyHSZC2M49Ff6w5CoY3FmdrNandGwzgL+y4AMZrjEOnmgTHV/UeXw+bRHYZCE4sztYvV7pwK4D8AxunOQhRktgK4yuWw7dUdhEIPizOdFP8yjvfAOL7MdZWJmlYF4A4A/3A5bPxlSwFjcaY28y9Q8R/w8iiiQH0A4FqXw5atOwiFBhZnCpjV7uwG4M8w1ljmbTeJ2qYIwE0uh+013UEo+LE4U0Csduc8GKNlq+YoRKHuZQC/cDlsxbqDUPBicaYWWe1OgXEW9r3gaJmoo2QBuMblsH2sOwgFJxZnapZ/oYrnACzUnYUoDCkAjwCwuxy2at1hKLiwOFOTrHbnGABvAhiuOwtRmNsF4EqXw/aN7iAUPDhNSd9jtTuvAPA5WJiJukIagE1Wu/Mi3UEoeHDkTMf5l3Z8EMAturMQRSAF4Hcuh+1+3UFIPxZnAgBY7c5+AF4DMFt3FqII9zKApTwOHdlYnAlWu/NMACsB9NedhYgAAFsAXMi1oiMXjzlHOKvd+UsYK+mwMBMFj2kAtljtzsm6g5AeHDlHKKvdGQ/gCQBX6c5CRM2qhHE99ErdQahrsThHIKvd2QfAe+C6y0ShQMG4CdA9XDwjcrA4Rxir3TkQxk34R+vOQkRtshLAEpfDVqU7CHU+FucIYrU7hwP4EMAQ3VmI6KRsA7DQ5bAd0R2EOheLc4Sw2p0TALwPoJ/uLETULkcBnMs7ioU3nq0dAfyXSq0FCzNROEgF8In/CzeFKRbnMGe1O38IYyq7p+4sRNRhesMo0JN0B6HOweIcxqx25yIAqwAk6M5CRB0uCcDHVrtziu4g1PFYnMOU1e5cCuAVADG6sxBRp+kF4COr3TlNdxDqWCzOYchqd94O4CkAZt1ZiKjT9QTwgdXunK47CHUcFucwY7U774OxshQRRQ4LgPf9J39SGOClVGHEanf+HcCvdOcgIm3KAZznctg26g5C7cORc5iw2p1/AAszUaRLBPCe1e7k0q8hjiPnMGC1O38G4HHdOYgoaFQCON/lsK3RHYRODotziLPanQsBvA6e/EVEDVUBuMDlsH2sOwi1HYtzCLPanTNh3GAkTncWIgpKVQDmuxy2zbqDUNuwOIcoq905BsAGGNc5EhE1pwDAGS6Hbb/uIBQ4FucQZLU7BwHYBGCg
7ixEFBL2wyjQBbqDUGB4tnaIsdqdvQCsBgszEQVuGIC3rXZnvO4gFBgW5xDi/x/rHQBpurMQUciZDuBFq93J3/shgP9IIcJqd5oBvAqAdwAiopN1MYDlukNQ61icQ8fjAC7QHYKIQt4tVrvzRt0hqGU8ISwEWO3OewD8QXcOIgobHgDnuBy2tbqDUNNYnIOc1e5cAOB/unNQaPKUFSD7yRuh3NUYdOtKmGK+fz5Q0UdPoGzbKvQ47SL0Omtpq31WfvsZite/APexbET3TIFlxpXoNvrE3SJr8zNxbM3TcOe74K0qhTmhF+JPmQTLrKsQlZjUoT8ftUshgGkuh+2A7iD0fZzWDmJWu9MK4DndOSh0HVvzDCSm+XvU1BYcQnnGh5CYhID6q87aifw370fckPHod+k9iB96GgpWPYiqg9uPt/HVVCLK0g+95l2Hfpfdi54zF6PK9RXyVqZD+bzt/pmow/QGsMpqd3bXHYS+j8U5SFntzhgAr8FYq5WozaoP70D1wW3ocdrFzbY59tHj6DFlAUxxiQH1WfLpK4gdNBZJZ9+AuCHj0WvedYg7dTJKPn3leJu4gaPR+4e/QLe0uYgbPB6J489B73NvhjvvAGrzDrb756IONQbAyzyDO/jwHyR4/RXAabpDUGhSPi+KPnocljOvhDmhR5NtKvZshLvwMHpMvzSwPj1uVB/KQLdRMxts7zZ6Nmqy98BXU9Hse03x/sGZ1xPYD0Bd6XwA9+sOQQ2xOAchq925CMDNunNQ6Cr/6j0ojxvdJ9uafN3nrsGxNU+j55xrYGph2rs+d/FRwOdBdO+G97+J7j0IUD64i4402K6UD8rrhrswC8XrnkNM6nDE9B9xcj8QdbbfWu3O83SHoBOidAeghqx251AAT+vOQaHLW1WK4g0vos/5t0HMTf8vXvrZSpi7JaHbmHkB9+urLgcAmGIbToHXTYnXvV4nb2U6qv3HomNShqHvonSIcDwQxJ6x2p3jXA5bvu4gxJFzULHanbEAVgJoeh6SKADF659HTOpIxA9t+qiIuzgHpVveRNL86yEibd9B47ccv+Kj4QtJZ9+AlKuXo/f5t8FXW428lcugPLVt3x91lX7gwCBosDgHl78DmKQ7BIWu2vxMlH+4hsxBAAAgAElEQVTzEXrOuBK+6nL4qsuh3DUAAF9NBXzuGhSvew7xp05BdO+Bx9tAKSiv22jfzOWVJ0bIDY8t1x1rNsV1a7A9OmkAYvuPROKYeeh32b2ozT2Ail1rO/gnpg52AW9QEhw4rR0krHbnlQBu0J2DQpvnWDbg8yDnxdu/99qRf16DxPE/gLsoC+68g6jct6nB62Xb30HZ9ncw4KZnEdWjz/feH90zFTBFwV2UhbjB445vdxdmAWJCdNKAZnNFWfrCFN8d7uLcdvx01EWWW+3OtS6HbY/uIJGMxTkIWO3OkQCe0J2DQl/swDT0u7LhibdVB7aj9PP/ou+idET1TIGvtgrKXdWgTf6qvyBu0Dh0n3QezAmWJvuWqGjEDR6Hyj0b0X3iiXOHKvdsQGz/UTDFdmvyfYBRwH1VpYiy9GvHT0ddJAHG5VXTXQ4bj0NowuKsmX+lqZUAArvQlKgF5gQLzIPHN9jmKckDAMQOGtPkHcIAQMwxiOreB3H13lu+42MUvvt3DLjhKURZ+gIALDOuQO7Ld6LooyeQMGI6qr7biqrvtqLvZfccf9+xT54GTGbE9h8BU2wi3IWHUfL564jqmdrgTmIU1CYB+COA3+oOEqlYnPV7DMC4VlsRdTWlAOUDcOIYdNzAMUi+8E4Ub3gBZV+9iyhLCvpccDviT5l8vE1MyjCUbX8H5V+vhvK4Ye6RjISRZ8Iy/dKAL9uioHC71e5c7XLY1ugOEol4b22N/NcVvqs7BxFRM7IAjHc5bMd0B4k0PFtbE6vdmQDgn7pzEBG1YCCM5Wqpi7E465MOwKo5AxFRay612p1LdIeINJzW1sBqd04AsBU85k9EoaEM
wEQuL9l1OHLuYv7VX54ACzMRhY7uAJ7RHSKSsDh3vZ8DmKY7BBFRG8222p1X6A4RKTit3YWsdmd/ALvBe2cTUWjKAjDS5bBV6g4S7jhy7lqPgoWZiELXQAB36Q4RCThy7iJWu/MCAKt05yAiaqdqAGkuh+2g7iDhjCPnLmC1OxMB/EN3DiKiDhAH4CHdIcIdi3PXuBfAIN0hiIg6yIVWu/Mc3SHCGae1O5nV7pwC4HMAZt1ZiIg60G4Yt/b06A4Sjjhy7kRWu1Ng3PqOhZmIws1oADfrDhGuWJw712UApugOQUTUSZZZ7c6+ukOEIxbnTuK/E9gy3TmIiDqRBcD9ukOEIxbnzrMYxrQPEVE4u9Zqd07VHSLc8ISwTmC1O6NgnCwxTHcWIqIusBnADJfDxoLSQThy7hw/AQszEUWOMwBcqjtEOGFx7mBWuzMawO915yAi6mJ36g4QTlicO951AKy6QxARdbGJVrvzPN0hwgWLcwey2p2xAH6nOwcRkSYcPXcQFueO9TMYq7YQEUWiWVa7c4buEOGAxbmDWO3OeHApNSIijp47AItzx/k5gBTdIYiINLNZ7c7xukOEOhbnDmC1O7sB+K3uHEREQcKuO0CoY3HuGDcDSNYdgogoSFxmtTuH6g4Rylic28l/hvZtunMQEQURM4A7dIcIZSzO7XcFgD66QxARBZklVrszVXeIUMXi3H6/1B2AiCgIxQL4je4QoYoLX7SD1e48HcBnunMQEQWpcgCDXQ7bMd1BQg1Hzu3DUTMRUfMSwd+TJ4Uj55NktTv7AjgEY+qGiIiadgTG6NmnO0go4cj55F0PFmYiotYMADBfd4hQw+J8MtItps2xv5h7g/ntTbGordYdh4goyP1Ed4BQw2ntk5FuOQ/AuwCgFEq+VMO+ecB9Re/PVVqa5mRERMGoEkCKy2Er0x0kVHDkfHKur/uLCCyTTftnvRr7p7TdsdfsuyfqmfVJKCnUGY6IKMgkAFikO0Qo4ci5rdItKQAOA4hqrolSqD2k+m5/2HNJ1P98Myb5YDJ3XUAioqC0zuWwzdUdIlSwOLdVuuU3AJYH2tyrTEc/8U3c+2fP4lMOqP5DOjEZEVEwUwBOdTlsLt1BQgGntdtucVsam8WXeo55+9yPY24f/GXsz77+pfnNT+NRU9lZ4YiIgpQAuFp3iFDBkXNbpFuGA9jX3m6UQlmGOuWrv3iu6LXRN25sByQjIgoF+10O23DdIUIBi3NbpFuWAUjvyC5rVPR3b3hnZi33XJpWgJ5cdpKIwt0Ml8O2SXeIYMdp7ba5sqM7jBX30Cuj1sz5IvbnvTbG/mrLZeY1W8zwejp6P0REQWKJ7gChgCPnQKVbJgPY1hW78irJ2+Abt/vPnsWD96rBp3TFPomIukgJjGueeQOnFjR7ORB9T5tOBGsPs6i+c83f9J1r/gYlKiHjWe8PS57wnD+xAvGJXZWBiKiTWAAsBPCq7iDBjNPagbtMx04tUjnu11FvztwRu1ScMXdunGf68hsdOYiIOhBv59kKTmsHoguntANRq6Jcb/vOyPyr+7KRR9E7RXceIqI2cgNIcjls5bqDBCuOnANzvu4A9cWIx3qJecOcTbE3J2+O/eUXPzZ/9HkUPG7duYiIAhQNYJ7uEMGMxTkwQVWc64jAnCpFp90X/Z/T98UuKXkp+r51Y+Tgft25iIgC8APdAYIZp7VbY9xLOxvG3W1CQrmK2/WC95zCf3oWjC9DN4vuPERETdjncthG6g4RrFicW5NuWQrgKd0xToZSqNqnBm5f7rms2we+KRMACZkvGEQUEawuhy1Td4hgxGnt1gXllHYgRBA/0pQ144mYhyZ+G/uTw3+PfnTdQMnP1p2LiMjvHN0BghVHzi1Jt8QCKAAQNtcXKwVfPnpu/6dngfsl79lT3IiK0Z2JiCLWSpfDpuUy1WDH4tySdMt8AB/pjtFZfEqKvlAjd/zZvbjfV2oYj/0QUVcrApDscth8uoMEG05r
t2yu7gCdySQq6XTTntlvxf5h5M7Ya/f8LuqF9RaUF+vORUQRIwnAVN0hghGLc8vm6g7QVbpJzaifRr03+6vYn8V/HHPbZpvps20CH7/NElFn4yVVTeC0dnPSLfEAigFE7DFZjzJlfeCb+t0DniuGZqqUgbrzEFFY2uBy2GbrDhFsWJybk26ZB+AT3TGCgVJQhejx1ZMeW9Wz3h9OrkFMnO5MRBQ2eCvPJnBau3lzdAcIFiKQPlI66c7oFWfuib2m5vWYZeunye7dunMRUVjgrTybwOLcvLm6AwQjEVimmL6d/VrsH0fvjr1m3z1Rz6xPQkmh7lxEFNJ43LkRTms3xbi+uRgAp28DoBRqD6m+2x/2XBL1P9+MST6YzLozEVFI2eVy2MboDhFMWJybkm45HcBnumOEIq8yHf3EN2nfnz1XWg+o/kN05yGikOAF0N3lsFXpDhIsonQHCFKTdAcIVWbxpZ5j3pZ6tmmbKkbi1097zit/2vujSVWITdCdjYiClhnAGABbdQcJFjzm3LSJugOEOhFILymfcHv0yhm7Yq/1rIq5e8MM044dunMRUdAarztAMOHIuWkcOXcgEfQYLwdnvRRzP6pV9HdveGdlPeRZlFaAnsm6sxFR0GBxrofHnBtLt5gBlAGI1x0lnCkFzxH02f6I5yK87p092QszvygSRbY1LoftLN0hggWLc2PpljQAO3XHiCReJXnrfeN3OTxXDtmrBp+iOw8RaVHgctg4m+bH4txYuuXHAF7UHSNSlaiEjGe9Pyx5wnP+xArEh81SnUQUkAEuh41rzoMnhDWFJ4NpZJHKcb+OenPmjtil4oy5c+M805ff6M5ERF1mgu4AwYLF+ft4MlgQEEG3MabMmc/EPDh+X+xPDv41+t9rU1CUqzsXEXUqnhTmx5Nwvo8j5yATI55TFpnXn3KJab03B0lfPOa50Peqd+5kD6KidWcjog7F4uzHY871pVsGATikOwa1zqekYLMvbef9nsUDdqpThunOQ0QdYofLYRunO0QwYHGuL91yPoC3dcegtilT8Ttf8J5T9E/PggnlSOihOw8RnTQ3gESXw1arO4huPObc0Km6A1DbdZeqMT+PWjUrI/an0e/H3PHpD0xbvwL4rZMoBEUDGK07RDDgMeeG+usOQCdPBPEjJWvGEzEPwa3Mh971TTv4oOeK4Vkqmf+uRKFjPICvdYfQjcW5If4SDxPR4h280Lx58ALTZl8+em79p2eB+yXv2VPciIrRnY2IWjRSd4BgwGnthlicw4wITH2leGp69PNn7I1dUv5qzL3rJ8r+vbpzEVGzUnQHCAYcOTfE4hzGTKKSTpc9s9+K/QMqVOyeFd75eY96LhxfgsSeurMR0XH9dAcIBhw5N8TiHCG6Sc2on0a9O/ur2J/FfRRz2yab6bPtPImMKCiwOIOXUp2QbkkAUKE7BunjUaasD3xTv3vAc8XQTJUyUHceogh1yOWwDdEdQjdOa58wQHcA0itKfAN/ZN4y8DzTFlWIHl8+6bFVPev94eQaxMTpzkYUQThyBkfOJ6Rb5gBYqzsGBRelULJdDf/6AfcVyVvUaF5/SdQ1erocthLdIXTiyPkEHm+m7xGBZYp8O/u12D+iSsXse807J+dhzyVjj6FHku5sRGGsH4CILs48IewEFmdqUbzUjlgS9eHs7bE3Jq6NufWzC00btwp8Pt25iMJQX90BdGNxPoHFmQIighirKXf6wzH/nPpt7E9yn4xevu5Uyc7UnYsojET8cWdOa5/A4kxtFiW+1HPM21LPNm1TxUj8+mnPeeVPe380qQqxCbqzEYWwiC/OHDmfwOJMJ00E0kvKJ9wevXLGrthrPati7t4ww7Rjh+5cRCGKxTmQRiLycSDbQhyLM3UIEfQYbzo466WY+8fuiV3y3f1RT63rg+J83bmIQkjEF+cWp7VFJA5AAoA+ItILgPhf6oHwK2a8nyt1uDhxD10c9cnQK82fuI+gz5a/ey7G697Z
U3wwmXVnIwpiEV+cWxs53wBgG4BR/j/rHv8D8I/OjdblYnUHoPAlguiBUjDtwegnpn0be3XBM9EPrBsphw7qzkUUpCK+OAd0ExIRuVkp9WgX5NEn3eIBwNEMdakSlZDxrPeHJU94zp9YgfhE3XmIgsRel8M2SncInQK+Q5iInAnAinpT4Uqp5zsnlgbpFh9OTNsTdSmlULFLDfnyr57LeqzxTRqvOw+RZgdcDttQ3SF0CuhSKhF5AcBQAF8B8Po3KwDhU5xZmEkjEXQbI5kzn4l5ELUq6uD/vGceWu65bFQOkiJ+eo8iUsRf5hvotPZuAGkqXG/EnW4x4cSXDqKgoBS8OUja/pjnQt+r3rmTPYiK1p2JqItkuxy2iF6MKNDrnHcgvM9m5vXeFHREYE6VotPui/7P6ftil5S8GH3/ujFycL/uXERdIOJHzoF+AH0A7BKRLQBq6jYqpRZ0Sqqux+JMQc0kqs9M8445TvPdKFPxO1/wnlP0T8+CCeVI6KE7G1EniPiTcwOd1p7T1Hal1LoOT6RDuiUOQJXuGERtoRSq9qmB25d7Luv2gW/KBEB43gSFixKXw9ZTdwidAho5h00Rbh5HzhRyRBA/UrJmPBHzENzKfOhd37SDf/Yt7J9njuM1+xTqKnUH0C3QkXMZjLOzASAGQDSACqVUeEyppVsSAZTpjkHUXpUiFTf3S966JS52JkQifmqQQlZ5xpKM7rpD6BToyLnBhyQiFwKY1imJ9ODImcJCglLdns7Jm7M1LnbXTf2SzdUm00jdmYhOQsRfPXNSRUkp9RaAszo4i04szhRWplbXpG3OzBp6cVn5OigV8VOEFHIivjgHehOSi+s9NQGYihPT3OGAxZnCThQQdU9B0Zxri0sPXZvaL78gyjxFdyaiAHl0B9At0EupLqj3dw8AF4CFHZ5GHxZnCltWj2fwmsNHBj9t6bHp770sI5RIH92ZiFrBkXMgjZRS13Z2EM14CQqFvaUlpWdeXFZe9NOUvhv3xcbM1J2HqAURX5wDGjGKyEAReVNE8kQkV0ReF5GBnR2uC4XTFD1Rs3r5fEmvZ+fMfCg3f3uUUpm68xA1I+LvOxHodO4zAFYB6A9gAIC3/dvCxTGwQFMEOaeyavLmzKy+syur1kGpiD++R0GnUHcA3QItzslKqWeUUh7/41kAyZ2Yq2ull3gBFOmOQdSV4pSK/0du/pyXs3MPdPP5durOQ1QPi3OA7QpE5CoRMfsfVyH8Prxc3QGIdBhXWztiU2bW6MUlZeuhVLnuPEQIv/rSZoEW5+sAXAYgB8BRAIsAhNtJYnm6AxDpYgJMdxYdm706K7s0xePZojsPRTwW5wDb/RHAEqVUslKqL4xind5pqfRgcaaIN8Dj7f/h4expdxYUbTYpxf8nSBcW5wDbjVdKHat7opQqAjCpcyJpw19ERH6Ly8rPWH8oK3Zsdc0GBHIDfqKOxeIcaDsR6VX3RESSEH6LYbM4E9Vj8SnLiqO5s/6Zm58R41Pf6c5DEYXFOcB2ywFsEpE/isi9ADYB+EvnxdKCxZmoCbOqqsdvzjw86IflFWuhVK3uPBQRWJwDaaSUeh7AJTDOaM4HcLFS6oXODKYBizNRM2KAmL/mF87975GcrB5e79e681DYi/jiHPDUtFJqF4BdnZhFN15KRdSKkW73qRsPHVGP9LJseMrSYzxELLozUViK+OLMBR9O4MiZKAACyK+Plcz6+HB2zWC3e7PuPBSWWJx1BwgiLM5EbdDX6+3rzDp6xr35hVvMSmXrzkNhozJjSQbvra07QNBILykFUK07BlGouai8YtqnmVk9plZVr4dSPt15KOQd0B0gGLA4N5SvOwBRKOqmVOIzOXmzn87J2x3n8+3TnYdCGv/7AYtzY5zaJmqHadU1YzZnZp26sKx8HZSK+KlJOil7dQcIBizODR3RHYAo1EUBUX8qKJqz6sjRvCSvd7vuPBRyOHJG+N3lq712AligOwRRODjF7Rmy
7tCRIU9Yemx8rJclTRl3FgxppdtKkftmLmpzahHVMwq9z+6NPuf2afE9Po8Pef/NQ+WBSlQdrIJyK4x9dmyT7QqcBSj+tBjuY25E94qG5QwLks9Phik6osZRLM7gyLmxHboDEIWbn5WUzlx76IgaVlv7qe4s7VHxbQUOPXYICacmYPAtg9FrVi/krMxBwfsFLb5P1SgUrS+CKcaEhGEJzbbLXZmLfGc+ks5KwpDfDEHSvCQUvFuA3Nci7hYMLM7gyLmxDN0BiMJRks/X+80jOTNWd0vYdmdy72SPyGDdmdoq/3/5SBiegAHXDQAAdB/bHd5KL/JX5SNpfhJMUU2PdczdzBj9j9EQERR+VIiK3RVNtivZXIKkeUnHR+KJoxPhLnajeHMxUn+c2jk/VPApyliS0fK3nQjBkXNDewC4dYcgClfnVlRO2ZSZ1WdGZdU6KOXRnactqg5VIXFMYoNtiWMS4a3womp/y+e+iUir/SuvgjnB3GCbOcEMRNaaYN/qDhAsWJzrSy9xg1MqRJ0qXqmEf+fmz3nxaO53CT5fyNwSWLkVxNywyEq08bwmu6bd/fea0wtFa4pQ8W0FvNVeVOytQNEnReh9du929x1C+PvXj9Pa35cBYIzuEEThbkJN7chNmVleR+9e61/pnjgZIomtv0ufmL4xqDrYcIRcdcB47qlo/yRAv0v7wVfrw8H7Dh7flnRWEvou7NvuvkMIL6Py48j5+3hSGFEXMQPmuwuPzX4vK7ukr8fzhe48LUmal4TSL0tRtLYI3govyjLKULDaODwqptanrVtT8F4BSjaXIPWqVJxy5ylI/XEqijcXI/eNiDohjCNnP46cv48nhRF1sYEe74CPD2cPeKFH980PJvUcpkSSdWdqrNfsXqg+XI3s57OR/Ww2JEaQclkKjr54FFE92ver1FPmQd7reUi9OhVJc40rzrqN7AaJEmS/mI3eZ/du9z5CBIuzX0T8a7cRR85EmlxdWnbGgvKK4htSkjfsjImZGdCZVF1ETIL+V/dHv4v7wV3kRnRyNGqP1gIAEoY2f4lUIGrza6G8CnGD4xpsjxsSB3iB2oLaSCjONTBOyiVwWrspBwGU6w5BFKksPl/PV7JzZz2aV/B1tFIHW39H1zJ3MyNuUBzMcWYUflKIhGEJiO0f264+Y3rHAACqMxuuvVPlMo5px/SJaVf/IWJ7xpKM9p9ZFybC/qtYm6WXKKRbdgGYpjsKUSSbW1k18TPX4Zrf9u2z7qOE+DMgorVCVe6vROW3lYgbHAdvlRcln5WgfEc5Tr371Abt9t2xDwkjEzBw6cDj28q+KYOvxofqQ0bxLfmiBAAQf0o8YvrEIMoShe6TuyPntRz43D7EDYpD9aFq5L2Vhx6n9YiEUTMAcG3weiLiX/wkbAeLM5F2MUDs3/IK5uyOif5uaUq/ijKzabyuLBIlKNlSgry38gABEkYk4NS7T0XcoIZT0cqrgEYLZ2Y/lw134YlbKBz+x2EAwIClAxAzy/jOMfD6gcj7Xx4KPyyEp9iD6F7RSJqbhOQFQXf4vbNs0h0gmIhSkXWFe0DSLVcBeEF3DCI6QQHqoV49Nz5r6T4eIhbdeajD9c9YknFUd4hgwWPOTQvpewAThSMB5LZjxbM+PJxdNdDt/kx3HupQmSzMDbE4NyW95CAA/odCFIRSvN6U97KOTl+WX/i5SSn+fxoeOKXdCItz8zh6Jgpii8orTv80MytxUnX1eijla/0dFMRYnBthcW7eRt0BiKhliUp1f/5o3uwnc/J2xfp8XDQhdPFM7UZYnJvHkTNRiJheXTN2c2aW9fzyirVQqrr1d1AQqQDwte4QwYbFuXlfwfiPhohCQDQQ/ef8wrlvHTma08vr/VJ3HgrYFxlLMkJq+dCuwOLcnPQSD4ANumMQUdsMdXus6w8dmfTzY8UbRaljuvNQqzil3QQW55at1h2AiE7OTcWlM9ccOuI9tdbNQ1TBbZ3uAMGIxbllLM5EIay3
z9fnf0eOznDkFWw1K5WlOw99TyVYnJvE4tyS9JK9MBbCIKIQZquonLo5MyvpjMqqdVDKqzsPHfdJxpIMnsDXBBbn1r2vOwARtV+8UglP5ObPee5o7r54n2+37jwEAHhXd4BgxeLcOk5tE4WRyTW1ozdnZo24tLRsHZTiFRl6OXUHCFYszq37GIC71VZEFDLMgPkPhcfmOLOOHkv2eLbqzhOhdmYsyTjU3k5ERAXwmBtgX6P87c9upZ1DRNp0DoOIJIhIuoiMDaQ9i3Nr0kvKwbuFEYWlwR7PwE8OZ0+9rfDYJlEqX3eeCPNOB/VzRr3HWf5tf2q0fXsH7as9EgAsAxBQceZ6zoF5A8A83SGIqHNcU1p25oXlFceuT+m7cU9szEzdeSLEmx3RiVLq+AplIpLo/+t39beHIo6cA/M6vrd8OhGFk54+X6+V2TkzH87N/zJaKZfuPGHuCIAtXblDERkkIs+JyEERqRKRvSKyTESim2jeS0ReEZFyEckRkTsD6D9ZRJ4WkTx//xtEZIr/tTgAdTMzK+pNt6c01x+LcyDSS46C99omigjzK6smbc48nDKvonItlOL5Jp3jrYwlGaqL99kXQA6AWwCcC+BvAG4C8Ncm2j4MoADAJQCeA3C/iCxtrmMRiQewBsBsAL8BcDGAMgAfi0gfADX+fQLA73Fiur2wuT45rR24lQBm6Q5BRJ0vViHukbyCuTtjYr79aWrfmnKTKaDjhBSwN7p6h0qpbQC2AYCICIwBVy2Av4vIb1TD69+3KaV+6f/7+yLSH8DvADzdTPfXARgKYLTyz7qIyCcA9gP4tVLq9yKyzd92fyBT7hw5B45T20QRZkxt7fBPM7PSflJSuh5KlerOEyYKoeGuYCJiEpH/E5E9AKpgXIXzNIBEAKmNmjc+Hv4GAKuI9G2m+7MBfA4gS0SiRCQKgBfG+gxTTyYvi3Og0kuywaltoohjAkz/V1Q8+4PD2RWpbs/nuvOEgTczlmTouEvbbwHcD+BVABcAmAbgVv9rcY3a5jXzvHERr9MHwBwYBb/+40oAg04mLKe124ZT20QRKtXrTf0gKzv11e6Jn9/fu9cQXwsn81CL/qNpv5cCeEkptaxug4hMbqZt4xFy3fOjzbQvgjF4u6WJ16raErIOR85t818YUxVEFKEuLys/fWNmVsKE6pr1UKqrT2oKdbsylmToWiIyHsaJWfX9uJm2FzV6fjGATKVU4xF1nY8BjARwQCm1tdFjp79Nrf/PxqP0JrE4t4Vx1vYHumMQkV7dlerx4tHc2f/Ozd8R41Pf6c4TQpo7oaorfAjgKhG5QUTOFZEVAAY203aKiDwqIj8QkQcAXAXgvhb6fgrGmeBrReQaEZkjIpeIyF9F5BcAoIxzFo4CuEJEZojIVP+x6SaxOLfdM7oDEFFwmFFVPW5z5uFB55VXrINSjUdl1JAbwAsa9/87GCd2OQC8BKAEwO3NtL0FQIq//TUAfq+UerK5jpVSlTCOOW+AUcQ/hHE51hAAX9Rrej2MLwQf+7f3aa5P4axMG6VbYgFkA0jSHYWIgsf+6OiD16T2LSkxmyfqzhKkXs9YkrFId4hQwZFzW6WX1MD41kVEdNwwt/uUDYeOTPjZsZKNolSx7jxBSOeUdshhcT45zU5vEFHkEkBuLi6Z+fHhI25rrVvXiU/BKAvA+7pDhBIW55ORXpIB44JzIqLvSfb6kt8+cvSM+/ILvjAr1aalBcPUsxlLMngTpzZgcT55T+gOQETBbUF55WmbMrN6TauqXoeGt4eMJAr6rm0OWTwh7GSlW+IBHAbQW3cUIgp+W+Nid93UL9lcbTKN1J2li32SsSRjvu4QoYYj55OVXlIF4HHdMYgoNEytrknbnJk19OKy8nUwLr2JFDwR7CSwOLfPYzhx1xciohZFAVH3FBTNeTvraEEfj3db6+8IeUdhLBpEbcTi3B7GHcNe1R2DiEKL1eMZvObwkSm3FBVvEqUKdOfpRA9lLMng
zVlOAotz+/1NdwAiCk1LS0rPXHfoiGlETe1G3Vk6QRGAf+sOEapYnNsrveRLaFiblIjCQy+fL+n17JbKVe0AABQcSURBVJyZD+Xmb49SKlN3ng70WMaSjHLdIUIVi3PH4OiZiNrlnMqqyZszs/rOrqxaB6U8uvO0UwWAR3SHCGUszh3jbQB7dYcgotAWp1T8P3Lz57ycnXugm8+3s/V3BK0nM5ZkFOoOEcpYnDtCeokPwJ90xyCi8DCutnbEpsys0YtLytZDqVCbGq4F8FfdIUIdi3PHWQGOnomog5gA051Fx2avzsouTfF4tujO0wYvZCzJOKI7RKhjce4o6SVecPRMRB1sgMfb/8PD2dPshUWfmZTK052nFT4AD+gOEQ5YnDvWCgD7dIcgovDz49Ly6esPZcWOranZgOC97/J/M5ZkfKs7RDhgce5IHD0TUSey+JRlRXburH/k5n8To9QB3Xma8GfdAcIFi3PHexkcPRNRJ5pdVT1hs+vwwB9UVK6DUsFyC+F3MpZkfKU7RLhgce5oxuj5Xt0xiCi8xQAxy/MK5qzMzjncw+v9RnMcD4D/05whrLA4d46XAUTCTe2JSLNRte6hGw8dGbe0uGQDlCrRFOPfGUsy9mjad1hice4M6SUKwG26YxBRZBBAbjlWMuujw9nVg9zuz7p498cALOvifYY9Cd6T/sJAuuUtAAt1xyCiyPJmYrct6X2SBvlEUrtgd7dmLMl4uAv2E1E4cu5cdwBw6w5BRJHlovKKaZ9mZiVOqapeD6V8nbirfQD+0Yn9RyyOnDtbuuURADfrjkFEkWlLXOzOX/RLjq42mUZ0QvcLM5ZkrOqEfiMeR86d7x4AxbpDEFFkmlZdM2ZzZtapC8rK10Kp6g7s+hMW5s7DkXNXSLfcCuAh3THC3ZFSH0Y+Vo4KN1B2Z3ckxsjx1zJyvbjz4xpsOOSBTwGj+5jwL1s8pvQ3N9uf3FPa5PYYM1Dzux7Hn+/M8+LW96ux8ZAXCdGCS9Oi8OAP4hrsnygYHIiOyrw2tV9hkdk8uZ1d+QBMzliS8XVH5KLvi9IdIEI8AuBqAJN0Bwln//dhNRJjBBXuhl84v8rxYtYzFVg4MhqvLkoAAHxxxIsqT8tfTDcvTfjetgtWVGHGoBMFvaRa4aznKzGitwmvLopHYZXCHR/W4Gh5Fd664vvvJ9LpVLdnyLpDR4Y8bunx6T96WUYrkaST7Oo/LMydi8W5K6SXeJFu+RmAzwA0P1Sjk7Yh04PV+z34//buPMquosDj+Ld6IQswD0IE0bANIij7KooQ43FQZHBBgYBsI3gURhEXUAfUcsEFZXBGERTUIxNlVVQkrGEVPKyCiYkYkC0sgSTkZu2kl5o/bod0d0LoJumu+15/P+f06fR9r1//Otuv6966Vf+1/whOu3FZr8c+8cc2DnljC5MOHfXSsfe84ZX/6u87rvdz7n26kzlLEkfu1PrSsR/fu5yl7YmrjxzNRiPLkfKYUYH3X7qU+57pZK81jMylXD5eLNjvwwsXzTlh803venS99d42wE9fCJw5GLm0ktech0os7gN+mDtGI+rsSnzq2ja+Mn4EY0f3PpU8/YVO7n66k0/ts95af51LprWzfiscsv3K0n5wdlnAK4oZ4MBtWwjANf/oWOuvKQ2WTbq6xv7u6efedvbzc+5vSempAXzqN6YeN3X2oAUTYDkPtTOBJ3KHaDQX3NdOWwf8596rFvDdszoBeLEtsesFi2j5+gK2/d+F/OyBgS1HnFLiiuntvH+HFka3rizito7yGnRPLU3QFGDGnM6BfzPSEDto8ZI973pi1ib7LVl6Gym90k+U9+P8mSFhOQ+lWCwGTs4do5HMXdLFl29p47/fPYLW5lUnYD23qLyufOxVbXxk51ZuPGY079m2hROvbmPyzP7fgn7Hk53MWpCYuGNrr+Nv2LiJh2Z30d658vr1/c900plg3lInW6o+jEpp9AWzXxg/6dnZj47u6pr+Mk9r
B06YetxUf+ocApbzUIvFZODy3DEaxRk3L+Mt45p573atq328q7sfT9yjldP3G8GEbVo47+BRTNi6mW//qf+j50umtrPxSHh3n2vVH9uzlRcWl6fVn1vUxd+e7+TkyW00B2j2X5fqzK7Llm9/1xOztj9iwcLbSGlxn4fPdhLY0PG/jzxOwXuf19rfnu/k539p56vjRzC/LTG/LbGkezBctCWWtifGjCpH0xO27l2q79ymhekv9G/hpI6uxG9mdPChN7WyXp/R+Q5jm/npISO5ZFo7m5+ziF0uWMw+r2tmt9c2sdn6/vNS/WmG5jPnvjh+8qxnX9y0o+Pe7sMzgG/kzDXcOFs7h1jMJtZOB36aO0o9mzmvi/YueOvPlqzy2LhzF3HC7q0cvcvqR9QpldeF+2PKPzt5YUniyJ1X/1of3X09jtq5lZlzu9h0/cDY0YFNzl7IiXus/SQ0KZctOjrGTXnqmXEX/8uGd54zZqPPP3T8tGWv/FlaVyznfC4CjgYOyB2kXr19y2ZuOa73vcTXPdLBd+9czuSjRvGvGzexzcZNbDwSpjzW0euU9JTHOth1s/6NbC+Z1s5rNwi8Y+uXvy1qZEtg583Kx3/54HK6Ehy+4+rLXKonxy5Y+OdjPztrqHe6GvYs51xikbrvfX4QGJk7Tj0aO7qJd2zdu2Afn1+eqt5/q5aXVuj6yvgRnH7jMjYaGdj7dc38ZkY7tz/RyW3Hryz2ix9azkd/38ajp2zAVhutfM1lHYnf/b2d43dbj6aw6lB7wbLEWbcv44CtmmlpCtzyeAfn/Hk5Fx4y8qVT6lIdm473NGdhOecUi4eJtU8DP8kdpZGduu8IuhL88J7lxFsT249t4srDR7H/Viv/+ncl6EzQd371tY90UCyDiTut/p9Kc4C/PNfJhQ8sZ2kH7LRpE1ccNooP7OCoWXWvAziWWHg6OwPX1q6CWPs/ylPcklQVZxCLb+UOMVw5nbQaPkE5G1KSquAm4Du5QwxnjpyrItbeDNwLuFuCpJxmA7sSC5fozMiRc1XEYjpwUu4Ykoa1BBxjMednOVdJLC4GfpY7hqRh67vE4sbcIWQ5V9EnAZfIkzTU7gK+nDuESl5zrqJY245y95cNc0eRNCzMBfYgFk/mDqKSI+cqisVM4ITcMSQNCx3AYRZztVjOVRWLK4Af5Y4hqeGdQixuyR1CvVnO1fY5yturJGkwnE8szs8dQquynKssFsuBw4EXc0eR1HBuody+VhVkOVddLB4HjgE6MyeR1Dj+SXmduSN3EK2e5VwPYnEN5RKfkrS2FgKHEIu5uYPo5VnO9SIWFwFn5I4hqa51AUd2r0ioCrOc60m5Q8wPcseQVLe+1H0mThVnOdefzwK/yh1CUt2ZRCzOzh1C/WM515tYJOA/gGtzR5FUNybjwkZ1xeU761WsjQamAPvmjiKp0m4A3kcsluUOov6znOtZrI0B7gDenDuKpEq6Gfh3YrE0dxANjOVc72JtHOVuMlvkjiKpUm4HDiIWS3IH0cB5zbnexWIWcCDlrjKSBOUP7AdbzPXLcm4Esfg78F5gce4okrK7h3LEvCh3EL16lnOjiMU9wKFAe+4okrK5H3g3sViQO4jWjuXcSGJxA3AUsDx3FElD7iHgQGIxP3cQrT0nhDWiWHsXcBWwQe4okobENGACsZiTO4jWDcu5UcXaXpQLD7wmdxRJg2oG8A5i8XzuIFp3LOdGFmtvBK4Hts6cRNLgmAmMJxbP5g6idctrzo0sFv8A9gOm5o4iaZ17GHinxdyYLOdGF4tngAOAP+WOImmduRnYt3udAzUgy3k4KGdvHghcnTuKpLV2EfAeZ2U3Nq85Dyex1gxcSLmrlaT6koAvuu3j8GA5D0ex9h3gC7ljSOq3JcAxxOK3uYNoaFjOw1WsfQY4Bwi5o0hao2cpt3y8L3cQDR3LeTiLtaOBnwOtuaNIWq2/Um75+FTuIBpaTggbzmIxCTgIeCF3FEmrmAzsZzEP
T5bzcBeLKcDuwB25o0h6yQ8pT2W7s9Qw5WltlWKtBTgLOA2vQ0u5dAKfJhbn5Q6ivCxn9RZrBwMXA2NyR5GGmYXAEcTi2txBlJ/lrFXF2pbA5cBbckeRhol7gaO7l9yVvOas1YjFk8D+wA9yR5EaXCfwTeBtFrN6cuSsNYu1Qylvt6rljiI1mMcoFxa5M3cQVY8jZ61ZuSLRHsADuaNIDeRiYDeLWS/HkbP6J9ZGAOcCJ+WOItWxF4GPE4srcgdRtVnOGphYm0i5ecYGuaNIdWYKcByxeDp3EFWfp7U1MLG4FNgF8HYPqX+WAZ8D/s1iVn85ctarF2uHAf8DbJ47ilRR04CPEIu/5g6i+uLIWa9eed1sB+A8oCtzGqlKEuUPrntbzHo1HDlr3Yi1fYCfALvljiJlNgP4VPe69dKr4shZ60Ys7gH2ory2tjhzGimH+cCpwC4Ws9aWI2ete7G2BfAj4H25o0hDoJPyDoYvE4s5ucOoMVjOGjyx9gHKre/G5Y4iDZJbgFO9rqx1zXLW4Iq1DYCvA6cAzZnTSOvKY8Dnu1fQk9Y5y1lDI9Z2p5zV/dbcUaS1sAj4NnAOsViWO4wal+WsoRVrh1DuwrNL7ijSACRgEvBFYvFM7jBqfJazhl6sBWAi8DVgu8xppFdyN/BpYnF37iAaPixn5RNrLcDxwFeALfKGkVYxAzgL+DWx8D9KDSnLWfmVO159HDgdeH3mNNKdwNnA1ZaycrGcVR1lSR8PfAHYJm8YDTMJuBo42z2WVQWWs6qnPN19FPAlyrW7pcGyHPgV8D1iMSN3GGkFy1nVFWtNwIeAM4BdM6dRY1kI/BQ4120cVUWWs+pDrE0APgYcCozInEb1azblblHnE4v5ucNIL8dyVn2JtTHAMZRFvWPmNKofM4HvA7908RDVA8tZ9SvW9qUs6SOA9TOnUfUsA64BLqacee2e46oblrPqX6xtCBwJnAjsnTmN8krAbZSTvK701LXqleWsxhJru1KW9NHARpnTaOhMpVxe8xJi8VTuMNLaspzrRAghAl8FHkkprbLkZQjhEWBb4GsppdjP1zwe+AWwYUpp0RqedyswJ6X04R5ZPplSGjugb2IoxdpI4MOUp70PyJxGg2MW8GtgErGYmjuMtC615A6gAWkDtgkh7JVSum/FwRDC3sBW3Y8PhpOB9kF67cERizbKkdQkYm1r4ODutwnAyIzJtHYK4ErKP9vbXMFLjcpyri+LgQcoN424r8fxicDNwJ6D8UVTStMH43WHTCwep9yu8jxibRTwTlaW9ZYZk6l/lgDXU15H/qOzrTUcNOUOoAG7FDg8hBAAut8f3n28lxDC4SGEqSGEZSGEp0IIZ4UQVvcD2ZtCCHeEEJaGEP4RQvhgn9e5NYRw5ZpChRDGhBB+EkKYHUJoCyHcFUJ4y6v/NgdJLJYSi2uIxcnEYitgZ8rlQm8HOvKGU7cEPEi5vvW7gDHE4lBi8RuLWcOFI+f681vgfODtwB3A/sBrgKuA7614UgjhQOAyyttITqPcP/kbwCbAJ/q85mXAj4FvUU6muiKEsGdK6aH+BAohjABuopyAdRrwPHAScFMIYbuU0nOv6jsdCrGYBkwDzibWNgIOpBxRH0T5+6qh8RxwI3ADcCOxmJ05j5SV5VxnUkrzQwjXUZ7KvqP7/XXdx3s+9evArSml47o/vq778W+HEL6ZUprV47kXpZS+DxBCuB6YTrmu9cR+xjoa2AnYMaU0s/t1bgIeBj5HWdjVV952czlweffSoXsD7wXGA7sD/5IxXaOZRXm2onxzXWupF8u5Pl0K/CCE8FnKGcmn9HwwhNAM7AGc2ufzLgO+C7wVuKLH8atW/CKl1BVC+D1w2ADyvAu4H3isz2nz24C9BvA61VEuWHF39xvEWgDeQPn7umf3+z2AjTMlrDeP0LuMH8ucR6o0y7k+/QG4iHIj+PUpt7rraSzQSrmOcE8rPh7T5/jzq/l48wHkGQvsy+pndD86
gNeprnJW8Mzut8tWHq9tw8qyXvG+ureYDa4EPE555qXn2wxisTBjLqnuWM51KKW0OITwR+AzwBUppcV9njKHsig37XN8s+738/oc3xSY2+fjZwcQaR7l7PGTVvNYY0/gKUeAj1He3tN9rLYlvct6e+D1NM4tXJ3AP1m1hP9OLJbkDCY1Csu5fp1PuTvTBX0fSCl1hhDupzw1fX6Phw4HuoA/9/mUDwIzAEIITcD7gXsGkGUK5USqJ1NKfUfhw08sngSeBH7X+3htDGVJ93wb1+fj3KPuLmA+5Q9r87rfz6X3iPhhZ01Lg8tyrlMppVuBW9fwlK8C14cQfkF5jXpnytnaF/aZDAZwYghhOeWs5Y9RXls9cgBxLqacAX5rCOH7lKOqTYB9gOdSSucO4LUaVyzmURbey69mFWsjgNfRu7xXXKZo6fH2Sh/3PNZEuXjHiqKdt5pfr3g/3w0ipPws5waVUrohhDAROBP4COV15HMoS7uvicC5wDcpZ9EekVL6ywC+VlsIYQLlDPGvUZ4+f55y9P2Htfk+hp1yRLriVLmkYcq1tSVJqhhXCJMkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIqxnKWJKliLGdJkirGcpYkqWIsZ0mSKsZyliSpYixnSZIq5v8BHvQh2KvZaeMAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Pie chart of the \"count\" column of page_views_by_platform.\n",
    "# NOTE(review): the labels are applied by row position -- confirm that\n",
    "# page_views_by_platform has its rows in Desktop, Mobile, Tablet order.\n",
    "page_views_by_platform.plot.pie(\n",
    "    y=\"count\",\n",
    "    labels=[\"Desktop\", \"Mobile\", \"Tablet\"],\n",
    "    figsize=(8, 8),\n",
    "    autopct=\"%.2f\",  # print each slice's share with two decimals\n",
    "    fontsize=15,\n",
    ")\n",
    "plt.title(\"Page views by platform\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Page views by traffic source"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>traffic_source</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6668961</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1667170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1663868</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  count\n",
       "traffic_source         \n",
       "1               6668961\n",
       "2               1667170\n",
       "3               1663868"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row count per traffic_source value, collected into a pandas frame that is\n",
    "# indexed and sorted by traffic_source.\n",
    "page_views_by_traffic_source = (\n",
    "    page_views\n",
    "    .groupBy(\"traffic_source\")\n",
    "    .count()\n",
    "    .toPandas()\n",
    "    .set_index(\"traffic_source\")\n",
    "    .sort_index()\n",
    ")\n",
    "page_views_by_traffic_source"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,1,'Page views by traffic source')"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd4AAAHUCAYAAACUBUmlAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzs3Xd8W9X9//HXR/JInD0hW+y9UmbYo0ArRgelpcuF8i2l0JZfS4vohAJFLdCyoWVDCy2lQAuihQIFyt6gEDZRSEhiO850vO3z++MqxElsx0PS0Xg/Hw89bEtX975l2fejc+6555pzDhEREcmNkO8AIiIipUSFV0REJIdUeEVERHJIhVdERCSHVHhFRERySIVXREQkh1R4RTbCzPY3s7c9bfsbZvakj213xwI3mdkyM3s+fd+pZlZjZg1mNi79dXPfWUXylQqveGdmKTNrSu+wa9I79uG+c63hnPufc24b3zkGy8weM7OTB7ma/YBPAlOdc3uaWTnwO+Bw59xw51x9+usHgw4sUqRUeCVfHO2cGw7MBPYAfuY5T8kxs7I+LDYDSDnnVqd/3gQYAryRtWAZ0sfXJ5J1KrySV5xzHwH/AnYEMLMTzexNM1tlZh+Y2SldlzezH5vZIjNbaGYnm5kzsy3Tj1Wa2cVm9mG6JX2tmQ1df5vp5Zab2Y5d7puQboVPNLODzGxBl8cmm9nfzazOzOaa2ffS9w9JP2d8+uefmVm7mY1M/3y+mV2a/v7TZjYn/bo+MrMze/m1mJldYWYrzOwtMzs0fecXzOyl9Rb8oZnd280KLgD2B65M9yxcmb7fmdlpZvYu8G76vsvMbL6ZrTSzl8xs//T93wSuB/ZJr+MOYE0X/HIze7TLOte8B0PN7BIzm5fO/2QP78F4M7s//T4sNbP/mVko/dh26db6cjN7w8yO6fK8dVrx63fN9/D6djCz/6S3U2NmP0nfHzKzmJm9b2b1ZnanmY3t5X0RGRAVXskrZjYN+DTwSvquWuAoYCRwIvB7M5uZXvZI4AfAYcCWwIHrre43wNbArunHpwC/WH+bzrkW4G7ghC53Hw887pyrXS9fCLgPeC29vkOBM8zsCOdcM/BClxwHAPOAfbv8/Hj6+xuAU5xzIwg+ZDzay69lL+ADYDzwS+DudEH4J7CZmW3XZdmvArd18xp/CvwPOD3dFXx6l4c/k97G9umfXyD4nY0Fbgf+ZmZDnHM3AN8Gnkmv4wRgh/RzRjvnDukm+8XAJ4BZ6fX9GOjsZrkfAguACQSt6J8ALt2VfR/wEDAR+C7wZzPrT9f/x6/PzEYADwP/BiYT/F08kl7ue+llD0w/tgy4qh/bEekb55xuunm9ASmgAVhOUKiuBob2sOy9wPfT398IXNjlsS0Bl/5qwGpgiy6P7wPM7WG9hwEfdPn5KeDr6e8PAhakv98L+HC9554N3JT+/jzgcqAMWAx8H4gTdMc2AePTy30InAKM3Mjv5hvAQsC63Pc88LX099cAF6S/34GgWFT2sK7HgJPXu88Bh2wkwzJgly55nuzyWCS9jrL11rklwQf7pjXP3cg2fgX8A9hyvfv3T/8eQ13uuwM4p7vX1E2+dV4fwYerV3rI8CZwaJefJwFtXV+bbrpl4qYWr+SLzzjnRjvnZjjnvuOcawIws0+Z2bPpbsHlBK3h8ennTAbmd1lH1+8nAFXAS+kuyuUErZwJPWz/UWCome1lZjMIWnz3dLPcDGDymnWm1/sTglYaBC3agwiOVSeB/xC0oPYG3nPOLUkv9/n0a5lnZo+b2T69/G4+cs51vZrJvPRrB7gF+LKZGfA14E4XtOD7o+vvbU139ZvpruHlwCjW/s77YzzBB473+7DsRcB7wEMWHFKIpe+fDMx3znVtJc8j6G3oq66vb1oveWYA93R5X98EOlj73opkhAqv5C0zqwT+TtBduYlzbjTwAEFrFmARMLXL
U6Z1+X4JQWtrh3RBH+2cG+WCAVwbSO/Y7yRoEX0ZuN85t6qbRecTtJpHd7mNcM59Ov3408A2wGcJuqrnANOBKGu7mXHOveCcO5ag+/Te9LZ7MiVdWNeYTtAKxjn3LNBK0DL8Mt10M3d9mRu7P3089yyCrvYx6d/5Ctb+zvtjCdAMbLGxBZ1zq5xzP3TObQ4cDfwgfSx7ITBtzfHetOnAR+nvVxN8wFpj0+5W3+X7+b3kmQ98ar33dogLxh2IZIwKr+SzCqASqAPazexTwOFdHr8TODE9+KaKLsdv04X0OoJjwhMBzGyKmR3Ry/ZuB74IfCX9fXeeB1aa2VnpgUNhM9vRzPZIb7cReAk4jbWF9mmCbuXH0zkqzOwrZjbKOdcGrCRoWfVkIvA9Mys3sy8A2xF8AFnjVuBKoN0519s5vzXAxs6vHQG0E/zOy8zsFwTH1/st/R7cCPzOggFpYTPbJ/2Bah1mdpSZbZn+gLHm99EBPEdQXH+cfv0HERTmv6Sf+irwOTOrSg/o+uZGYt0PbGpmZ1gwqG6Eme2Vfuxa4IJ0j8eaAXbHDuS1i/RGhVfyVrrF+T2CAruMoEX3zy6P/4vgeOp/Cbopn0k/tKar9az0/c+a2UqCQTU9Dspxzq3ZyU8mGFnd3TIdBDv+XYG5BK266wm6Y9d4HCgnKNJrfh4BPNFlma8BqXSubxMMiurJc8BW6W1dABznnKvv8vhtBAO0emvtAlwGHGfB5BeX97DMgwSv/R2CLt1m1uuK7qczCbrcXwCWEgx4626/sxXB+9NA8D5e7Zx7zDnXChwDfIrg9V9NcOz9rfTzfk/Q4q8h6Hb/c29h0n9TnyR4DxcTjHQ+OP3wZQR/Xw+Z2SrgWYJj+iIZZeseOhIpXOnRvbMJBhe1+86TK+nTc2qBmc65d33nEZHeqcUrBc3MPpvuuh1D0Jq6r5SKbtqpwAsquiKFQTO5SKE7BbiZ4Hjg48B3vKbJMTNLEQx8+oznKCLSR+pqFhERySF1NYuIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOqfCKiIjkkAqviIhIDqnwioiI5JAKr4iISA6p8IqIiOSQCq+IiEgOlfkOICL9F4klKgj+f8Pr3QzoBDrStzXft6Ti0Q4/aUWkK3PO+c4gUrIisUQImAxEgInAaGBM+jZ6va9dv68cwOYagOXAsvTX5b38/BEwNxWPLhnYKxORnqjw5oCZnQOc7pwb34/nHA9UOeduzlauwTCzbwA3ASOccw2e4+S1SCwxDtgc2Kyb2wygwl+6jWoAUsDcLrePf07Foyu9JRMpUOpqzl/HA+OBmz3nkD6KxBKjgZldbjsQFNcRPnMN0nBgx/RtA5FYYinwHvAa8ArwMvB6Kh5tyllCkQKjwlsCzMyASudcs+8sxSISS2zCukV2JkF3cakZC+yZvq3REYkl3mJtIX4FeCUVj67wkE8k76jw5piZHQT8FzgYOA34FFALXOycuzq9zM3A59PfrzkWcK5z7pz0fccCPydohSwHbgV+6pxrSz9+DnA68Bng98DOwMlmNn9j204/fx/gbGB3YBTwLnCRc+7PGf51FIRILFEG7AUckv46E5jkNVR+CxO09ncAvpq+z0ViibkEhfhJ4JFUPDrbUz4Rr1R4/bkOuAX4I3ACcJWZveicex44D5hOMJDmO+nlF8DHx37vAP4A/ATYAriQ4NSw
M7usvyq9/t8C7wALWVssets2BMcdnwKuBZqBfYGbzKzTOXdHBn8HeSkSSxjBh5VD07cDCLpcZeCM4Dj35sBxAJFYohZ4FHiEoBDP9RdPJHdUeP25wzl3PoCZPQYcDXwOeN45976ZLQVCzrln1zwh3WV8EXCrc+47Xe5vISieFzrn6tN3DwV+4Jz7R5fl1hTeHrcN4Jz7y3rbfAKYCvwfQdEvOpFYYgvWFtqDgQl+E5WEicCX0jcisUSKdBEGHk3FozX+oolkjwqvPw+t+cY512Zm7xIUt95sTdASvtPMur53jwJDCLqeH1+zWuBfA9m2mY0BzgWOBaYQdB1CcIpJUYjEEuXAYcBngU9Smsdn800E+Gb6RiSWeAN4ALgrFY8+38vzRAqKCq8/y9f7uZWgePZmzelID/Tw+LQu3y9zzrUOcNs3A3sTdHnPAVYCpxIU4oKVnnTiMOALBK9ljN9EshFrjhP/KBJLfAjclb49m4pHdR6kFCwV3sKyNP31WwQjRdfX9RjZgHZMZjYEiBKcd3xtl/sLcnrRdLE9nKDYHkNw3FwKz3TgB+nbgkgs8XeCIvyUirAUGhXe/NVdC/htgu7eiHPuuixtt5Kga7llzR1mNoKgaBXEDi4SS1QCRxAM4jmGYGS2FI+pwPfTt4WRWOJu4G/Ak6l4tNNrMpE+UOHNX28Bx5rZZwhGNC90zi00sx8Ct5nZSIJjuK0EI0U/AxznnGsczEadcyvM7AXgF2a2kmCu3xiwAhg5mHVnWySW2BE4heAUFrVsS8NkglPnTgc+jMQS1wM3pOLRhX5jifRMhTd/XQ3sBtxIcCzyXOAc59xf0wXxJ8BJBBPgfwDcT1CEM+HLBKca3QrUA1cSnJ50eobWnzGRWGIIQTfyKQSnPUnpmg78CvhFJJa4n+B0uIfUFS35RnM1S0GKxBLbEBTbaoLZk0S6M5fgvPUbdXqS5AsVXikY6YFSnyMouAf5TSMFpg34B8HEM4+oFSw+qfBK3ovEEhOAMwgm8NDEFjJY7wGXAden4lHNXy45p8IreSsSS0wBfkRQcKs8x5HiU0Mwl/nVqXh0le8wUjpUeCXvRGKJzQlGUleT39eqleKwDLgCuCwVjy7d2MIig6XCK3kjEktsTzBa+0usnaZSJFcaCEZCX5KKRxf7DiPFS4VXvIvEEp8AfkpwLrJ5jiPSDNwE/CYVj87zHUaKjwqveJMuuOcDR/rOItKNduA24JepeHS+7zBSPFR4JeciscQM4NcE1wJWC1fyXTPBKOgLU/HoCt9hpPCp8ErORGKJ0QRdyt8lmBNapJAsIeihuToVj7b5DiOFS4VXsi4SS5QB3wF+iWaZksL3PnBmKh6913cQKUwqvJJVkVjicOBSYDvfWUQy7BHgjFQ8Ott3ECksKrySFZFYYivgd8BRvrOIZFEHwTSUv0jFo/W+w0hhUOGVjErPp/wz4Cw0+YWUjqXAD1Lx6C2+g0j+U+GVjInEEnsBNwA7+M4i4sm/gVNS8eiHvoNI/lLhlUGLxBJVwHkEFzIIeY4j4tsq4GyC0c/awcoGVHhlUCKxxEHA9cAWnqOI5Jv/ASen4tF3fAeR/KLCKwMSiSVGAhcRXDlIk2CIdK8ZOAe4OBWPdnjOInlChVf6LRJLRAkmk5/qO4tIgXgJ+GYqHn3NdxDxT4VX+izdyr0S+JrvLCIFqJ1g5qvzUvFop+8w4o8Kr/RJJJaYCdyJjuWKDNZjwJdT8egi30HED41AlY2KxBKnAU+joiuSCQcBr0ZiiU/6DiJ+qMUrPUp3Ld8AHOc7i0gR6gQuJLjsoAZelRAVXulW+lq5f0WtXJFsewI4IRWPLvQdRHJDXc2ygUgs8V3UtSySKwcQdD0f4TuI5IZavPKxSCwxiqBr+fO+s4iUIAfEgZ+r67m4qfAKAJFYYifgXmBz31lEStwTwOdT8egS30EkO9TVLERiiU8BT6GiK5IPDgCejcQS2/gOItmhwlviIrHEd4D7gBG+
s4jIx7YAnonEEgf7DiKZp67mEhWJJULAJQRXFBKR/NRGcJnBm3wHkcxR4S1BkVhiGHA7cIzvLCLSJxcCP9VlBouDCm+JicQSk4D7gZm+s4hIv/wN+HoqHm32HUQGR4W3hERiiZ0Jiu4031lEZECeA45JxaO1voPIwGlwVYlIj1x+EhVdkUK2F/BcJJbY3ncQGTgV3hIQiSVOQCOXRYpFBHgiEkvs5juIDIwKb5GLxBJfB24Dwr6ziEjGjAMeicQSe/gOIv2nwlvEIrHEN4GbUNEVKUZjgIcjscQs30Gkf1R4i1QkljgVuA69xyLFbCTwYCSWONB3EOk77ZSLUCSW+D5wNWC+s4hI1g0HHojEEof6DiJ9o8JbZCKxxI+AS33nEJGcqgLuT5+9IHlOhbeIRGKJnwK/9Z1DRLwYAtwbiSU0I12eU+EtEpFY4lzgfN85RMSrCuCuSCxxnO8g0jPNXFUEIrHEWQQX0BYRgeDiCken4tEHfQeRDanwFrhILFEN3Ow7h4jknQbg4FQ8+qLvILIuFd4Clh5I8U+gzHcWEclLdcC+qXj0Xd9BZC0V3gIViSX2BB4FhvnOIiJ5bS6wTyoerfEdRAIaXFWAIrHEVkACFV0R2bjNgH9FYgnN1Z4nVHgLTCSW2BR4EBjvO4uIFIzdgHsisUSF7yCiwltQIrHESOBfBJ9gRUT641Dg1kgsoRntPFPhLRDpT6r3ALv6ziIiBeuLwO99hyh1KryF42bgEN8hRKTgfT8SS/zAd4hSplHNBSA9/7KmghSRTOkADk/Fo4/6DlKKVHjzXCSWOAR4CF1TV0QyawnwiVQ8+qHvIKVGhTePRWKJacBLwATfWUSkKL0I7JeKR1t8ByklOsabpyKxRCXwd1R0RSR7die4drfkkApv/roC2MN3CBEpeidFYolv+Q5RStTVnIciscTJwHW+c4hIyWgFDkjFo8/5DlIKVHjzTCSW2AP4H1DpO4uIlJQFBIOtan0HKXbqas4jkVhiPHAXKroikntTgb9GYgld7SzLVHjzRHoatzuA6b6ziEjJOgi4wHeIYqfCmz/+H3CY7xAiUvLOjMQSB/gOUcx0jDcPRGKJ7QnO1x3iO4uICME1fHdJxaOrfAcpRmrxepY+nnIrKroikj82QxdTyBoVXv9+BnzCdwgRkfV8MxJLHOU7RDFSV7NHkVhid+AZQKMI+8B1drDy+btpeP0h2lfWER46iqpt92Psof+3znKtdSmWP34LzfPfABzl46Yy9vDTqNx0y17X39G0kuWP30rje8/iWhoJj5zIqH2+wPAdD+2y7nkse/R6WhbMwcorqdpmX8YcfBKhiqHZeMkivi0GdkzFo/W+gxQT7fA9icQSQwi6mPUe9FH9A5fSPO81Ru17AuVjp9K+agltS9ad37215gMW334WVVvuxYRjzwKgZdE7uPbep6LtbGmk5s8xrGIIYw/7NuGhI2mr/xDX0d5lmdXU/OWnlI+dzPhjf0xn0yqWPXYTHauXMfFzP8v8Cxbxb1PgWuALvoMUE+30/bkA2M53iELR9MFLrH7zCSadeAUV43s+46r+wasYusWejD/6zI/vG7r5xnvyVzxzJ66jjU1P+D2h8uA06iEzdl5nmVUvJ3DtLUz8/C8IDRkOQGjICOruPo+WRe9SOWmrgbw0kXx3XCSW+EoqHv2z7yDFQsd4PYjEEgcCZ/jOUUgaXv8PQ6bv3GvRbV3yIa2L3mbkJ/p/WKoh+TDDd/7kx0W32/XXzqVi060+LroAQzfbDTCa3n+h39sUKSBXRmKJKb5DFAu1eHMsEkuMAG5GH3r6pWXR21RtuRdL/3MNDbMfhc5Ohmw+k7GHfZuyEeMAaF34NgCdzQ0svPF02pZ8SNmoiYzc+3hG7HJ4j+tuW76YzsblhIYMp+Zvv6Q59RqhyiqG7XAwYw76BhYuB8C1t2Lh9f5lQmEwo61+fnZeuEh+GA3cCBzhO0gx0M4/934NRHyHKDQdq5fRMPsR
WmvmMuGYHzPu09+ndfF71N1zAWsGCHasXgbAksTvGbbDQWzyxfMZstknWPrvy3ttkXamn7fssZsoGz6Oicefy6h9jqfh1X+x/InbPl6ubMwk2mrnrnPct3Xxe+A66WxuyMbLFsknh0diia/7DlEMVHhzKBJLzARO9Z2jIDnAOSZ8/mcM3WIPhm13AOOP+iGti96hed5r6WWCAjx858MZtddxDJmxM+MOP5XK6Tuz4tm/9bzq9PPKx01n3Ke+x9AZuzByj88wcu8vsPKl++hsawZgxC5H0NG4gqUPX0tHwzJa6+ax9KGrwULBTaT4XRSJJUb7DlHotLfIkfRczFcDYd9ZClFoyHDKJ8wgPHTkx/dVTt0ewmUfd/OGho4AYMj0ndZ57pAZO9O2pOeu4PCQ9PNmbPg8OtpoX74YgPJx0xh35OmsnvMEC676Gotu+i4Vk7amYpPNCQ/TvkhKwkSCXjsZBB3jzZ2Tgb18hyhU5eOm4jraNnzAgZl9vEy3nIP0Mt0pG7MprH/sds3zAFj73OE7H86w7Q+ibdlCwlWjCA0dyfzLv8zwnXs+hixSZE6JxBI3puLRF30HKVRq8eZAJJYYB1zoO0chG7rFnrTVpehoXPHxfS3zZ0NnO+UTNwOgcsp2hIYMX9v1nNY87zUq0st0x8LlDI3sRvO81zd4npVXUj5m8rrLl1VQMSFCeNgYVr/xGLhOqrbdf5CvUKRghIBrIrGE6scA6ReXGxcA43yHKGQjdj2S0JCR1P79VzS+9xyr5zzGkvt/x5AZuzJk6g5AUEBHzTqBlS/+kxXP3EnT3Feof/BKWua/wah9T/h4XQ2zH2Heb4+hfcXa632PmvUlWms+YEniUprmvsyK5+5mxbN3MWrv47GyYFRzZ0sjyx67icb3X6Dpg5dY9tjN1P/7csYedgrhdDe3SInYHTjJd4hCpSkjsywSS+wCvIw+5Axa27KFLH34D7TMn42Fyhi61d6MOfT/CHc5rxZg5fP3sPLl++lYVU/52CmM3u8rVG0z6+PHG5IPU//ApUz59g2Ujdrk4/ubPniJ5U/cSuuSeYSrRjN8lyMYNeuLWHrgVGdrM3X3XEDr4ndx7a2Uj5/BqH2Op2rrfXLzCxDJL7XAVql4dKXvIIVGhTfLIrHEfwkuLi0iUmwuScWjZ258MelKhTeLIrHE54G7fOcQEcmSNoKLKLzjO0ghUfdnlqQvgnCR7xwiIllUDlziO0ShUeHNnlMJLiYtIlLMjorEEvv6DlFIVHizIBJLVAFn+c4hIpIjv/IdoJCo8GbHd4BNNrqUiEhxOCQSSxzgO0ShUOHNsEgsMQz4se8cIiI5plZvH6nwZt5pwATfIUREcuzASCxxiO8QhUCFN4MiscRw4Ee+c4iIeHKu7wCFQIU3s74LjPcdQkTEk/0iscQnfYfIdyq8GRKJJUYAmsFFREqdWr0bocKbOd8DxvoOISLi2T6RWOJI3yHymQpvBkRiiZHAD33nEBHJE2r19kKFNzNOBcb4DiEikif21AjnnqnwDlIklggTTJghIiJrfdd3gHylwjt4xwDTfYcQEckzR0diiRm+Q+QjFd7B06c6EZENhQkOw8l6dD3eQYjEEjsAs33nEBHJU/XA1FQ82uw7SD5Ri3dwTvcdQEQkj40Dvuw7RL5R4R2gSCwxCvia7xwiInlOh+PWo8I7cCcCw3yHEBHJc7tGYon9fIfIJyq8AxCJJQydQiQi0lc6LNeFCu/AHAls5TuEiEiB+HwklpjsO0S+UOEdmNN8BxARKSBlwCm+Q+QLFd5+isQSmxC0eEVEpO++4jtAvlDh7b8vEJwYLiIifbdFJJbY03eIfKDC238n+A4gIlKgtP9EM1f1SySWmA6kAPMcRUSkEC0imMmq03cQn9Ti7Z8voaIrIjJQk4CDfIfwTYW3f9RNIiIyOCW/H1VXcx9FYoltgTd95xARKXDLgE1T8Wir7yC+qMXbdyX/KU1EJAPGAEf4DuGTCm/f
fcl3ABGRIlHSVyxSV3MfRGKJmcBLvnOIiBSJRmBiKh5d7TuID2rx9s0XfQcQESkiVcCnfYfwRYW3bz7lO4CISJEp2al3VXg3IhJLbArs5DuHiEiROdx3AF9UeDeuZP84RESyaGokltjedwgfVHg3rqSHvYuIZFFJNmxUeHsRiSUMOMx3DhGRIlWSDRsV3t7tCkz0HUJEpEgdEIklKn2HyDUV3t6VZDeIiEiOVAH7+w6Rayq8vVPhFRHJrpLbz6rw9iASS1QB+/nOISJS5EruOK8Kb88OAip8hxARKXI7pedLKBkqvD071HcAEZESYMAhvkPkkgpvz/byHUBEpETs4TtALqnwdiMSS4QJTiUSEZHs2913gFxS4e3edsAw3yFERErEbukGT0lQ4e1eSX36EhHxbBhBg6ckqPB2T4VXRCS3Sma/q8LbvZL5AxARyRMls99V4V1PJJYoA3bxnUNEpMSUzMhmFd4N7QgM8R1CRKTE7ByJJcp9h8gFFd4NlUx3h4hIHhlC0PApeiq8GyqZ7g4RkTxTEvtfFd4NzfQdQESkRO3mO0AuqPBuaGvfAUREStRWvgPkggpvF5FYYjww0ncOEZEStYXvALmgwruuzX0HEBEpYdNKYWSzCu+6SuLTlohIngoDM3yHyDYV3nWpxSsi4lfRN4BUeNelwisi4lfR74dVeNdV9J+0RETyXNHvh1V411X0n7RERPKcCm+piMQSlcAU3zlEREpc0TeAVHjXiqDfh4iIbyq8JaTo32wRkQIwPBJLTPQdIptUeNea5DuAiIgAoMJbIsb4DiAiIkCR749VeNcq6jdaRKSAjPUdIJtUeNdS4RURyQ9FvT9W4V2rqN9oEZECUtT7YxXetYr6jRYRKSDqajazR/pyX4FT4RURyQ9FvT8u6+1BMxsCVAHjzWwMYOmHRgKTs5wt14r6jRYRKSBFvT/utfACpwBnEBTZl1hbeFcCV2Uxlw9F/UaLiBSQou5q7rXwOucuAy4zs+86567IUSZfVHhFRPJDUe+PN9biBcA5d4WZzSKYz7isy/23ZilXTkViieH08XchIiJZp8JrZrcRXKrpVaAjfbcDiqLwAiN8BxARkY8N9x0gm/raytsd2N4557IZxiO1dkVE8keTiqjNAAAgAElEQVTYd4Bs6ut5vLOBTbMZxLOifpNFRApMUe+T+9rSGw/MMbPngZY1dzrnjslKqtwr6jdZRKTAFPU+ua+F95xshsgDmsFLRCR/qPA65x7PdhDPivpNltIRCq+oJdzS7DuHyCA1+Q6QTX0d1byKYBQzQAVQDqx2zo3MVjAR6Z+RNKz43Ygfvf//Nh2zN2a28WeI5K0GONF3hqzpUxerc26Ec25k+jYE+DxwZXaj5VTHxhcRyW/XVfzu1cOaG/bZurXtKd9ZRAapqPfJAzq26Zy7Fzgkw1l8avcdQGQwdrb3393T3toP4Nqa2m1wboXvTCKDUNSFt69dzZ/r8mOI4LzeYjqnt6jfZCl+t1T8ZrVZMFZhQkfnhC+tanjiLyNHHOA7l8gAFfU+ua+jmo/u8n07kAKOzXgaf4r6TZbi9s3wA0+PsYZZXe87q37ZrHuGD3u3JRTaylcukUEo6n1yX0c1F+9R7oC6mqUgVdG8+idlt2+2/v1lUHZxXf3q724ywUcskcEq6n1yn47xmtlUM7vHzGrNrMbM/m5mU7MdLodW+Q4gMhCXl1/xQtg6J3X32EGNTbtGWtueznUmkQwo6jEKfR1cdRPwT4Lr8k4B7kvfVxRS8WgjXWbkEikEW9hH8w4NvTKrt2X+uLh2M5xryFUmkQxZ5jtANvW18E5wzt3knGtP324Giq0Pq6jfaCk+f674da0ZFb0tM6mjY9JnG1a/lKtMIhlS1PvjvhbeJWb2VTMLp29fBeqzGcyDon6jpbgcF378+U1t2R59WfbnS5bOKndubrYziWTQUt8Bsqmvhfck4HhgMbAIOI7im1akqN9oKR4VtLVcWHb9Jn1d
vhzKf11Xr79vKSRF3RDqa+E9D6h2zk1wzk0kKMTnZC2VH0X9RkvxiJdf90y5dczoz3OOXN34ialtbc9mK5NIhhX1/rivhXdn59zHvwjn3FJgt+xE8kYtAsl7U6hb9NnQk33qYl7fdYtrp+FcUU8+L0VDhRcImdmYNT+Y2Vj6PvlGoSjqN1qKw20VF6bMGDaQ505t75jy6dWNz2U6k0gWFHVDqK/F8xLgaTO7i2CqyOOBC7KWyo+ifqOl8B0eeuGVzUOL9xnMOs6rq9/noWFV89rN+tVVLZJjRd0Q6uvViW4luCJRDVAHfM45d1s2g3mgwit5K0xH++XlVw4f7HoqoPKXS5bWZiKTSBYVdeHtc3exc24OMCeLWXwr6jdaCtvPyv701BBrOzAT6/pMw+o9rhwz6vmasrI9M7E+kSwo6v3xgC4LWKQW+w4g0p1xrFjyjfCDu2Zyndcvqt0U5zRbm+Srj3wHyCYV3rXe9x1ApDu3VPzmLTNGZXKdkfb26Yc2Nun0IslHdcnqZFHPn6/Cu9Z8oM13CJGu9g698cYOlto3G+uO19XvGXZuQTbWLTIIRd8IUuFNS8WjHQTXGRbJE85dV/47zLBsrH2Ic0PPrl9W1F16UpA+8B0g21R411X0n7SkcJxR9vcnR1jTDtncxhdXNew1vr1DF1GQfFL0+2EV3nUV/RsuhWEkDSu+F75n21xs6w+La8fiXGsutiXSB0W/H1bhXVfRv+FSGP5Y8ftXQ+ZycunNrdvaNtuvqfmZXGxLpA/U1VxiVHjFu53sg3f3sjezMqCqJxfXLvlEyLlFudymSA+Kfj+swruuon/DJf/dUvGb1Wa5nQt9mHPDf7h0ua7ZK741EVx6tqip8K7rA4K5qEW8ODH8r2fG2qqMTpbRV19fuWrW6I6OV31sWyRtbrI6WfT7YBXeLlLxaEl82pL8VEXz6p+W/dnrxQuuXVw3DOfafWaQklYSvY4qvBt6x3cAKU2XlV/5Qpl1TvaZYYfW1q32bG55ymcGKWlv+Q6QCyq8G9I5jZJzm9vCeYeFXh7UJf8y5dKaut3MuTrfOaQkveg7QC6o8G6oJN54yS9/qvh1jRmVvnMAjHBu5GnLVqjnR3woif2vCu+GXvAdQErLZ0P/e2GyLc2rS/R9a8XKWSM6OpO+c0hJWZqsThb9ObygwruBVDz6PkV+LUjJHxW0tfym/I8TfedYn4FdU1NbgXOdvrNIySiJ1i6o8PakZP4AxK8Ly69/psI6vI5k7skuLa3b7NLS+qTvHFIySma/q8LbPXU3S9ZNoW7R50L/28N3jt5cVVO3szlX7zuHlAQV3hJXMn8A4s+tFfG5ZgzznaM3ozo7R5+8YuUc3zmkJJTMfleFt3tq8UpWHRp66dUtQotm+c7RF6cvW7HvsM5OFV/JpppkdXK+7xC5osLbjVQ8ugBY7DuHFKcQnR1Xll+R1y3drkIQurymzuFc0U/lJ96UTGsXVHh7U1J/CJI7Pyv705NDrXUr3zn6Y8/mlh22b23VjFaSLSW1v1Xh7dlzvgNI8RnHiiUnhv/t5SIIg3XN4rrtzLnlvnNIUSqp60Gr8PbsEd8BpPjcVPHbN80Y5TvHQIzt7Bz3tZWrXvedQ4pOC/CE7xC5pMLbs+cBfbqXjNnT3pyzk83N6QXuM+0HS5fvO6Sz823fOaSoPJmsTjb5DpFLKrw9SMWjHcDDvnNIsXDuhoqLnVlh/8+FIfz72iUtvnNIUXnId4BcK+idQA6U3B+EZMf3w3c/NcKadvCdIxP2a2reeUsNtJLMKbn9rApv7x70HUAK3whWr/h+2d3b+M6RSX9YXLcVzq30nUMKXg3wmu8QuWY6Na93kVjiTWBb3zmkcN1efv7js8JzDvSdI9POGzfm8TtHjijI19VS08KSfy2h6f0mmhc0U7V1FZufvfkGyzXPb6bmrhpWv7MaHFROqmRy9WSGRob2uv72hnZq7qph1cur6GjqoHxcOROOnsCYfcds
sKzrdLx/7vs0z2tm+hnTGbnryIy9zgLwp2R18mu+Q+Rame8ABeBBVHhlgHawue/tE5pT0AOqenJ2/bJ9/zF82HstodCWvrP0V8tHLax6fRVVm1fh2rtvfDTNa2LuhXMZsdsIpp06LbhvbhOdrb1fsKmjqYO5v55LaEiISV+dRHhEmJaPWnrczrInltG+rH1wL6hwlVw3M6jw9sVDwPd9h5DCdGtFfJVZcf6flUHZb+rqV52xyQTfUfptxK4j2HZm8Hn6wys/pH3VhoVv4S0LGbHrCKadMm3t83YesdF1191Xh2t3bBbbjFBFcDRv+HbDu122Y3UHNX+vYZPjNmHhTQsH8lIKmQP+4zuEDzrGu3GPEZxnJtIv1eF/PzPOVu3mO0c2HdrYtNuMtraCm/zAQtbr480fNdP0QRPjDhvX73Uve3IZYw4Y83HR7U3N3TVUbVnF8O27L8xF7vVkdbIkp+ZV4d2IVDzaCOiapNIvQ2lp/HnZn/LyOruZ9sfFtRGcW+07RyY1fRCcVtqxuoP3fv4es0+azds/epuljy/t9Xmtda10rOwgVBUi9bsUb3zzDd787pssumMRne3rdlE3z29m2f+WsemXNs3a68hzJdnNDCq8ffUv3wGksFxaftXzZdY52XeOXJjc3jHp2IbVRTXXbvuKoOt5wXULGLXPKCI/ijBipxEsvGkhq15btdHn1dxZQ/nocmb8cAYTjprA0keXUvv32nWWXfinhYw7dByVm1Rm74Xkt4TvAL6o8PbN3wiOR4hs1Oa2cN7hoRf38Z0jl36xZOk+5c7N9Z0jU1xn8O8+5oAxTPj0BIZvN5zJX5/MsO2GUZeo6/l56bNEKidXMuWkKQzffjjjjxjPhKMmUP+fejpbglbv8meX07qolQnHFN7x8Qz5CPif7xC+qPD2QSoe/RDQhAHSJ7dVXFhjRkk1Yyqg4vy6+nrfOTIlPDwMwLDt1r1647DthtHyUc9DPsLDen6ea3e01rbi2h2L71zM+Oh46Ay6szubg4LsWhwdTR2ZfCn56q/J6mTvw8OLWFGOtsySO4D9fIeQ/PaZ0JMvTrH6PX3n8OHTqxt3v6yt/bmF5WV7+c4yWJWTevjc5Oi1uVIxsQIr62bg1pr+shB0tnTSvrSdxXcsZvEd644tmn/NfComVrD1b7ceUO4CcofvAD6p8PbdncBl6HcmPSinvfW35X8c7zuHT9ctrp0SnTqpCbPeZ5jIc1VbVREeFmb1nNWM2GntKUQNcxoYMm1Ij88LlYUYvsNwVr+57lizhjkNWIUFhTlkRM6KrPN4+4p2Fly7gE2O22SD1nIRei9ZnSyqMQH9pa7mPkrFo0vQRROkFxeUXf90hbVHfOfwaXp7+9QjVzfm/bWsO1s6WfHCCla8sIK2ZW10rOr4+OfOlk5CZSEmHDOB+ofqqb2vlobZDXx080c0vtPIxGMnfryeZU8tY/ZJs2ld0vrxfROOnUDzvGYWXL+AVbNXseRfS1iSWMKEoyYQKg9hYWP4dsPXuVVtUQVA5dTKj78vYiXd2gW13vrrduBI3yEk/0xmyaIvhJ/Y3XeOfHD+kvq9Hx5W9WG72XTfWXrSvrKd+VfNX+e+NT9vfdHWVEyoYPwR48FB/cP11N1bR8WkCqadNo1h23RpkXamb12GXlZtXsX0M6ZTc1cNK55dQXhkmAlHT2DCUSU7kGp9JV94NVdzP0RiiREEk3oXdDeaZN7DFWc+vWVo4SzfOfLF3cOHPf/LCeNK8li39Oq1ZHVyV98hfFNXcz+k4tFVwP2+c0h+OST08msquuv6XMPqPSe2t7/gO4fknZJv7YIK70DoD0c+FqKz46ryy9UD0o3rFtdOxDlNtyprOOAvvkPkAxXe/nsAWO47hOSHn5Td/tRQay36cz8GYvO29hkHNTY96zuH5I1nktXJeb5D5AMV3n5KxaMtwF995xD/xrKi/qTwAzv7zpHPfltXv0fYuY9855C8cKPvAPlChXdgrvQd
QPy7qeKiOSFjtO8c+Wyoc1Vn1S+bv/ElpcgtJTgrRFDhHZBUPDqb4HKBUqL2sLfe3Nk+KMoL3GfaCasa9h7b0fGy7xzi1Q3J6mST7xD5QoV34K7wHUB8ce6Gios6zPT/01d/XFQ7GufafOcQLzqBq32HyCfacQzcP4APfYeQ3Ds9fO9TI61pR985Csk2bW2bz2pqftp3DvHivmR1MuU7RD5R4R2gVDzaAVzjO4fk1nAaV/6/sru28Z2jEF1Su2RmyLnFG19SiozGxKxHhXdwrgOafYeQ3PlD+e9fCZvT3H8DMNy5EWcsW/6+7xySU28mq5Oa4349KryDkIpH69GEGiVje0u9Pyv0hgZUDcKJK1btO6qj4zXfOSRn1Nrthgrv4F3uO4Dkxq0V8RVmurDIYF27uK4K50riau8lbiVwq+8Q+UiFd5BS8eirwJO+c0h2fS380LPjbeVM3zmKwY6trVvt3tyi/5nid3OyOtngO0Q+UuHNDLV6i9gQWpp+WXbrNN85islltXW7mnN1vnNI1nSgbuYeqfBmxt3AW75DSHZcWn71c2XWOcV3jmIystONOnX5ird955CsuSNZnXzXd4h8pcKbAelTi37lO4dkXsQWzT8i9MLevnMUo28vX7nv8M7O2b5zSMZpf7gRKryZ81fgDd8hJLP+VHHhQjOG+M5RjAzsqsW1YZzr9J1FMurPau32ToU3Q1LxaCdwru8ckjlHh55+caot2ct3jmI2s6V1u51aWp/ynUMypgM4z3eIfKfCm1l3AUnfIWTwymlvvaT82nG+c5SCq2vqdjTnlvrOIRlxW7I6+Z7vEPlOhTeDUvGoA87xnUMG7/yyG56usPbNfOcoBaM7O8ectGKlDtMUvnbU2u0TFd7Muwd4xXcIGbhJ1C8+Pvz47r5zlJLvLVuxb1Vn55u+c8ig3JKsTn7gO0QhUOHNMLV6C98tFfH3zRjuO0cpCUHospq6DpxzvrPIgLQB5/sOUShUeLMgFY/+E3jRdw7pv4NDr7y2degjzcfswd7NLTtu29qmgVaF6WZd+q/vVHiz55e+A0j/hOjsuKr8Mp065NE1NbXb4NwK3zmkX1qBC3yHKCQqvFmSikcfAB7znUP6LlZ2+1NV1qpr7Xo0vqNzwldWrnrVdw7pl8uS1cl5vkMUEhXe7Po+wXltkufGsHLpyeEHdvKdQ+BHS5fvN6Sz8x3fOaRPatBI5n5T4c2iVDz6OnCd7xyycTdWXDQ7ZIzxnUMgDOFLapc0+c4hffKTZHVyle8QhUaFN/t+DizzHUJ6tru9/eau9v5+vnPIWgc0Ne+yhQZa5buXgJt9hyhEKrxZlopHl6CpJPOYczdWXNRupv+FfPOHxbVb4pxaU/nrjGR1UvNsD4B2NrlxFZpKMi+dFv7HUyOtUcd289AmHR2bHLeq4WXfOaRbtyark0/6DlGoVHhzIBWPtgOnApocII8Mp3HlD8r+trXvHNKzn9Yv27ei073vO4esYznwI98hCpkKb46k4tGn0PGQvHJN+WWvhM1N9J1DelYGZfG6JTqvN7/8LFmdrM3UyszsG2b2kpmtMrNlZvaKmf0uU+vvsp2bzaxfExuZWcTMnJkdlcksKry59WNAV2HJA9vZvPf3CyVn+c4hG/fJxqaZ09vanvGdQ4BgQNU1mVqZmZ0NXA88CHwO+DrwD+CYTG2ji/OAb2Rhvf1mmho1tyKxxP8Bf/Sdo9S9WPntl8fbypm+c0jffFQWXnjk1MmjMBvmO0sJ6wBmJauTz2dqhWb2EXCvc+609e43lwfFycwiwFzgaOfc/Zlar1q8uXc98KjvEKXsq+H/PKuiW1imtHdMPrqhUfOf+3VRJotu2mhg8fp3rl90zWy8md1iZvVm1mhmj5nZBlcQM7P/M7OkmTWbWY2Z3WVmo9KPrdPVbGaTzOxGM/vAzJrM7B0zO9/MKjL8Gjegwptj6asXfQPQcSsPhtDSdE7ZLVN955D+O2dJ/T7lzqV85yhR
r5Od+edfBr5rZtVmNq6X5e4FjgDOBL5IULv+a2ZbrlnAzH4G/AF4HPgMwYDWFdDjlcbGExz6+wFwJHARcCJwxWBeUF+oq9mTSCzxdeAW3zlKzVXllz0WDT93kO8cMjCJYVUvxiaO17WSc6sV2DNZnXwt0ys2s50JiupmBGd9vAn8HbjYObcyvcyRwL+Ag5xzj6fvGwakgLudc6eY2WhgIXCtc+4HPWzrZmBH51y3fz9mVgYcD9wIjHTOtaqrucik4tFbgbt95yglEVs0/9Oh5/b2nUMGLrq6cfdJ7e2Z7u6U3p2TjaIL4Jx7HdiOYDDV1YARzPb3opmtaanuCdStKbrp560G7gfWzDi3DzAUuKmv27bAGWY2x8yaCK4p/GegEpg+qBe2ESq8fp1CMMm45MBt5RcuNEOX/Stw1y2qnYRzzb5zlIhngN9mcwPOuRbn3H3OudOdc9sDJwNbAd9MLzKJ7veTNcDY9PdruqkX9WPTZwCXAPcAxxIU+DWDvLK6n1Dh9Sg9neS3fOcoBUeFnnlpWmjJXr5zyODNaG+fdnhj03O+c5SARqA6WZ3M6RXWnHM3EBx73TZ91yKgu/PtN2Ht6Zn16a+T+rGpLwB/c8791Dn3kHPuBWD1ACL3mwqvZ6l49J/0o3tE+q+M9rZLyq8Zu/El/XhvaSen3NfELtc2EP7VSg66ufv//WRNB0fd3sio+EpGXLiSPa9r4KWFG98n1jcG69/04lUMvWAl217ZwK2vtX78+DmPNWPnruz2duH/WjL2OjPp13VL9ipzbr7vHEXurGR18t1sbsDMNiioZjYBGMXaVu5zwEQzO6DLMlVAFFgzbeUzQBNQ3Y/NDwXW/wP/Sj+eP2BludiIbNT3gYOBiOccRem8spuerrT2A33n6MkbtR088F47e08N09pDHX11cQf737SaY7cp56/HVQHwwkcdNLX3PjhyZYvjgJsbGV4BV3xqCOOrjDl1nets5+SZFRy55bq7gnvfauc3T7Xyqa3ycxdR6Rjy0yVLF547Ydw031mK1MMEc8xnW9LM/gE8BNQCMwhGLjeSHnzqnHvQzJ4C/mpmMYLW7ZkEhfOi9DLLzew84IL06UAPEByrjQLnOuc+6mbb/wG+Z2bPAe8TFN0tu1ku4zSqOU9EYokDgf8SDC6QDNmUpTXPVJ5eZcYI31l60ukcIQve9uPubGRJo+Oxb6w7T8Te169m8zHG7Z+v6te6Yw83c9ecNpKnDmdoed//tKK3N/LBsk7ePK2nMzHywyHTJr9YV1amUc6ZtQLYMVmdXJDtDZnZaQTHV3ckOF67GHga+JVz7q0uy00gOB57NMHx1+eBM9Pdw13XdwpBQ2YLgsuxPgGc7Jxbuf6o5vTgrSvS24dgsOu9wH3ATs652dka1azCm0cisUQcOMt3jmLyYMWPn9omtGBf3zn6qrvCO6eugx2uXs3TJ1Wxz7T+tUA3uXgVZ+xVwdn7V/b5OUubHJtevIqfHVDJLw7s+/N8eL+8LPWZKZMmk4NJD0rIF5LVybt8hyhmOsabX36KZrXKmANCr71eSEW3J88tCPqFlzU7drm2gbJfrWSLy1dxw8utvT5v7rJOalc7Rg8xPv3nRirOW8mEi1bxgwebae3o+QP3XXPaaOuEL+2Yn93MXW3R1h45sKnpad85ishFKrrZp8KbR1LxaAfBrCwf+s5S6IzOzmvLLy2KVtDihqBIfv2eZr6yUzn/+VoVR25Rxsn3NfPAu229PC+4RvmPH25mygjj31+t4if7VXDNi6387NGeB039ZXYbMyeF2HpcOLMvJEsuqq3fM+zcQt85isCjwNm+Q5QCFd48kz7F6POAzlMchLPK/vJklbVsu/El819nunF68sxyfrxvJQdvVsZV0aEcHAlz4ZM9t3rXPG+HCWGuO2Yoh2xWxv/bp5Kz96vk8udaaWzbsNW7aFUnj8/r4IQdy7PxUrJiqHNVZy5dNs93jgI3H/hSrk8dKlUqvHkoFY++
CHzHd45CNYaVy74Vvn8n3zkyZezQYFDUwZF1u34P2ayMOXWdfXjeui3XQzYL09IB7y/d8Ll3vtGGc/DFHQqn8AJ8dWXDPmM7Ol72naNAtQCfT1Yn63wHKRUqvHkqFY/eRDDht/TTDRUXJ0PGGN85MmW7Cd3/mzoHoV4GKm8xNkRFN73Fa8ZTdvfcv7zRzn7Tw0wbVXi7hj8srh2Fcz33vUtPvpusTr6w8cUkUwrvv6u0fA941neIQjLT3nlrN3tvv40vWThmTQszZgg8Mrd9nfsfmdvOLpv0/C9cETY+uXkZj6Y2fF5VOWw5dt3nppZ38uyCwupm7mrb1rYt9m5u1kCr/rk+WZ28zneIUqPCm8dS8WgrwfFezefcRzdV/LbVrLD+rhvbHHfNaeOuOW18tMpR17j258Y2R0XY+MWBlVz6bCu//l8L/3m/nW/f38QT8zr4ZZfTfW59rZWyX61k3vK1Xci/OLCSVxZ1cuI/mnjo/XYufrqF+JOt/GS/SirL1m3y/mV2G2UhOG77/B/N3JPf1yyZGXJO/y998wJwuu8Qpahw/8NKRCoeXRiJJb5AMOJQ71cvTg3/46lR1lhwpw/VrnZ84W9N69y35ue53x9OZLRxxt6VdDq44vlWznnMsc34EHcdP5T9Z6z9k+h00OGCa6utseeUMPedUMXZjzRze7KNicOMn+5fydn7bzjg+y+z2zh0szAThhXU55Z1DHduxPeWrUheOnb0Jr6z5LklwHHJ6mR+zgla5DSBRoGIxBKnElw2S7oxjKZVr1ee3BQ2191k6lJi9p0+5bWV4fAuvnPkqWbg8GR18n++g5Sqwv1oW2JS8eg1wK9958hX15Rf+rKKrqxx7eK6oTinU2M21Al8WUXXLxXeApKKR38K3Og7R77Z1j78YP9QcpbvHJI/dmpt3XpmS8tTvnPkodOS1cl7fIcodSq8hedbQMYm6y4Gt1ZcuMyMwhyKK1lzeU3dLubcEt858sj5yerktb5DiApvwUlPK3k8wfUnS96Xww8/N9FWfMJ3Dsk/ozrdqFOWr3zTd448cUOyOvlz3yEkoMJbgFLxaBNwFFDSO5UhtDSdW3bLFN85JH99Z/mK/YZ1dr7hO4dn9wGn+A4ha6nwFqhUPLoUOALo7gLPJeHi8mufK7eOqb5zSP4ysCtr6gznep5bs7g9A3xRczDnFxXeApaKR+cDRwLLfWfJtelWsyAaem4v3zkk/+3e3LL9jq2tpTjQ6i3g6GR1smmjS0pOqfAWuFQ8Ohs4mhK7mtGfyi9cYMZQ3zmkMFy9uG4Hc26Z7xw5tAA4MlmdrPcdRDakwlsEUvHok8AxQEl8so2Gnn1peqh2b985pHCM6ewcW71i1WzfOXJkHnBgsjqpSyXmKc1cVUQiscRBBKcaDfMcJWvKaG97o/Kk+ZXWvrnvLFJYOqFz7xlT324KhbbznSWLPgAOUdHNb2rxFpFUPPoYwYCrlZ6jZM2vym5+WkVXBiIEoUtrlrRTvK2Nd1FLtyCo8BaZVDz6FPBJinDA1SYsrT0h/OhM3zmkcM1qbt5p69a2Yrx04FsERXeB7yCycSq8RSgVjz4PHAIU1cCKWyp+844ZI3znkML2h5rarXFuhe8cGTSboOgu8h1E+kaFt0il4tFXgIOBWt9ZMmH/0OvJbWx+wV3yT/LP+I7OCSesbHjNd44MeQ04OFmdLIr/81KhwVVFLhJLbAc8AkzynWWgjM7ONyq/+U6VtWzrO4sUhw7o2GvG1A9aQqGtfGcZhJcILu+31HcQ6R+1eItcKh59EzgAmO87y0D9uOyvT6roSiaFIXxJ7ZJG3zkG4RngUBXdwqTCWwJS8eh7wCyCbqmCMppVy04J37+j7xxSfA5sat5ls8IcaPU3glOGiuk4dUlR4S0RqXh0AbAfkPCdpT+ur7gkGTI31ncOKU5/XFy7Oc41+M7RD3GCuZdLaqa6YqPCW0JS8WgDcCxwue8sfbGbvfv2J+wdDaiSrNm0
o2PTzzWsftF3jj5oA05OVifPTlYnNTCnwGlwVYmKxBKnAZcBYd9ZevJq5f+9PtpW7+w7hxS3NmjbO8j2j9AAABPGSURBVDJtfqtZvk7Mshw4LlmdfMR3EMkMtXhLVCoevYrg4gqrfGfpzinh+55S0ZVcKIfyC+rq8/UCCnOBWSq6xUUt3hIXiSV2IpjfebrvLGsMo6nh9cqTG8LmNvWdRUrHp6ZOenZBeXk+XXzjWeBYnaNbfNTiLXGpeDQJ7AW84DvLGleVX/aSiq7k2nWLa6fhXL6cYvQ3NDFG0VLhFVLx6GLgQOAO31m2sQ/nHhh6fZbvHFJ6prZ3TPn06kbfH0A7gJ+jkctFTV3Nso5ILPEtgkFXQ3xs//nK77w40Zbv7mPbIq3Qsldk2uJ2sxkeNr8IOCFZnXzcw7Ylh9TilXWk4tE/EnQ9v53rbX8p/OhzKrriUwVUnruk3kf37n+AXVV0S4MKr2wgFY++DuwO/ClX26yktfm8spsm52p7Ij05pqFxj03b23PV5byma/lIHc8tHepqll5FYomTgCuAqmxu5/LyKx4/JvzMgdnchkhfpcrKPjx66qSJmGXzkIu6lkuUWrzSq1Q8eiOwJzAnW9uYbjULjg49s2e21i/SX5H29umHNTY9l8VNqGu5hKnwykal4tE3gD2Am7Ox/tvKL1xgxtBsrFtkoC6sq98z7NyCDK9WXcuirmbpn0gs8RWCuZ4zcuGCI0PPvXxtxWUzM7EukUy7c8TwZ88bPzZTk2q8BZyUrE4+k6H1SYFSi1f6JRWP/hnYHrhrsOsK09F+afnVowafSiQ7jl/VsPf49o7BXkShHbiQoGtZRVfU4pWBi8QSnwWuBgY0y9T5ZTc8/tWyRzSgSvLae+Xlcz87ZdMpmFUM4OmvAScmq5OvZDqXFC61eGXAUvHoPcB2wE39fe4mLK39SviR3TKfSiSztmxr22z/puan+/m0VoJjuXuo6Mr61OKVjIjEEocBfwQ268vy/6qIPbld6MP9sptKJDMazVbvM2Pqyk6zSX1Y/DmCY7lZOxNACptavJIRqXj0YWAngukmO3tbdr9QMrmtfagL3EvBqHJu2A+XLk9tZLFG4AcEl/FT0ZUeqcUrGReJJfYGrgN2XP8xo7NzduXJbw+z5u1yn0xkcPafPuXV5eHwrt089G/g9GR18v1cZ5LCoxavZFwqHn0W2BU4HVja9bEzy/72lIquFKo/LK4djnPtXe56FzgqWZ38lIqu9JVavJJVkVhiLHAOcOooGhpeqTylI2RunOdYIgP2zU0nPv780CG7AecBlyerk62+M0lhUeGVnIjEEtvfUH7R6YeGXznVdxaRQeioD4WuOmjG1F8nq5M1vsNIYVLhldw6Z9RhwG8BnUokheZ+4CzOWaGBUzIoKrySe+eMMuArwPmAjwuOi/THS8CZnLPiMd9BpDio8Io/54yqBL4D/Ajoy/mRIrn0KnAB8HfOWaEdpWSMCq/4FxTgE4Ef08cJOESy6BngfM5Z8YDvIFKcVHglf5wzqgw4AYgRXIhBJJceJSi4//UdRP5/e3ccFWW55wH8+5tBYVCY0gZFXB0zQVScy8HsWCpGt6Kr964nddPsRG2WW7uStZbeanOWTna3si07mZw8rdXprt6bpTc6py66BNbZqyspgdDgJTVTUAwaBoFhhnn2j3cwGkHpBu8LM9/POe+ZYebhnd8758jX53mf933CG4OXBh7tHPBCAE8AmGFwNRT+PoIWuH8xuhCKDAxeGtic1lugBTBXMaK+FADwPoBn4XQfNroYiiwMXhocnNYbAKwG8PcAhhhcDQ1e3wN4G8DrcLq/MroYikwMXhpcnNZR0CZi3Q/gaoOrocHjAIAtALbD6W41uhiKbAxeGpy088A3A3gAWi84ytiCaABqBvAugC0cTqaBhMFLg5/TOhrAP0LrBduNLYYGgMPQere/h9PtMboYolAMXgofTqsJwC3QesHzAQw1tiDSUSO0yVJv
wOneb3QxRJfC4KXw5LRaAfwawGIAtwKIMbYg6gcNAHYB+COAvXC6fQbXQ9QrugVvaWlpQlRU1FZoi6NzHeDuBQBU+P3+FRkZGWeNLiZsOK1xABZAC+HbAFiMLYh+hgYAH+CHsPVfpj3RgKNb8JaVlf1p9OjRqTabrclkMrGb3Y1AICD19fXWurq6SofD8Ruj6wlLTuswAL+CFsLzAQwztiDqhe/wQ9j+D8OWBjs9Z4JOs9lsjQzdnplMJmWz2dx1dXXTjK4lbDnd56H9Af8jnFYLgGxoveAsABONLI0uCAA4BGAPgEIAxQxbCid6Bq+JoXt5we+IQ/F60K7n/CC4AU7reGgBfFPwkSsm6acaWtDuBVAEp7vR4HqI+g2vfSTq5HSfAPBfwQ1wWlPxQxDPA3ClUaWFodPQQlbbnO5vDa6HSDeGBa993UcZfbm/47+bX3q5NrGxsektLS2HLtUmLy8v4ZFHHjkXFxcX6LvqLuZyuYYuWLBg0tGjR4/05+fQz+B0VwGoAvBa8FKldAA3BB/Toa2gxNtXXl4zgDIAXwS3/cHvligisccbIj8/f9T999/f8FOC1+/3IyqKX2VYc7oDAEqDW/A1azS0WfrpXTYHgFgDKhwovod2fvaLLlt18PsjIkRo8BYUFMTl5eWNGTFihM/lclnS0tJadu3adWzDhg0JZ8+eHZKZmZl85ZVX+vfv31/9/vvvx+fl5Y1pb2+X8ePHe7dv337carUGkpKS0pYtW3auqKgofuXKlWe3bt2akJGR0fzZZ5/Fezwe85YtW45nZ2c3u1yuoXfeeeeE1tZWEwC88sor39x8883njf4OqA843V5cHMYmACnQQvgX0CZs2QFMQPgMVSsAtQCOAfg6+FgBoBRO99dGFkY0GERk8AJAVVWV5fDhw1/b7XZfRkbG5MLCwuFPPfXU2ddff31UcXFxdWJior+2tjZqw4YNiSUlJdXx8fGBJ598cvQzzzwz6sUXX6wFgJiYmEBpaakLALZu3Zrg9/ulvLy8aseOHda8vLwx2dnZ1WPGjPHv27evOjY2VpWXl0cvW7bs6oqKCg6zhSutZ9c5RP37H79njYcWwPYeHuN0q/PyvocWqJ3b112eH4fT3WZgbUSDWsQGb1pa2vmJEyf6AGDq1KktNTU1F91e8NNPPx1WU1MTM3PmzMkA4PP5JCMjo7nz/bvvvvtHMy+XLFnSCADXX3/9+ccee2woALS3t8t99903vrKy0mIymXDixIno/jwuGsCc7iZo5zrLun/fOhxar/iKXmxWANHQZsCbg4+dzzu62XwA3NACtTFkC33te94Fiqj/RGzwRkdHX7i0yWw2w+/3S2gbpRRmz57d9OGHHx7rbh+h54FjYmIUAERFRaGjo0MA4Nlnnx2VkJDg27lz57FAIACLxdKnk8oojDjdzdAmIp00uhQi6j+8XjTEsGHDOtxutwkA5s2bd/7gwYPDKyoqogHA4/GYvvzyy5/UY3W73ebExESf2WzG5s2bR3Z0dPRH2URENEgY1uPtzeU/RsjJyTl32223TUpISPDt37+/Oj8///jSpUuvbm9vFwBYv379qenTp3t7u7/Vq1efXbRo0cRdu3ZdOXv2bI/FYuHsTiKiCKbnvZqPOxyOc7p82CBXVlZ2lcPhsBtdBxER9T0ONRMREemIwUtERKQjBi8REZGOGLxEREQ6YvASERHpiMFLRESkI+PuXOW09u0dnJzuXl0XvHbt2tE7d+4caTKZlMlkwubNm09kZWX1y6IFvVmGkKi/iMg9AFYBSAbgB3AcQJFS6lEDy/oREbFDu//zr5VSBcZWQ6SPiLpl5J49e4Z98sknV5SXl1daLBZVW1sb5fV6L7pV5E/h8/kwZAiXZKWBRUR+C+AZAM8DWAcgBkAGgLsADJjgJYpEETXUfOrUqSEjRozwWywWBQCJiYl+u93u27dvX+y1116bMnXq1NTZs2dPOnHixBAA2Lhx41XTpk1LTUlJmXLr
rbdO9Hg8JgBYtGiRfcWKFWOvu+665Iceemis2+02LV682J6cnDwlOTl5yrZt267o/MxVq1YlpaSkTHE4HJNPnjwZUf/RIUP9C4B8pdQTSqlCpdSHSikngEl6fLiImEXkooVHiCjCgnfhwoVNp0+fHmq326fddddd4z766KPhXq9XcnNzx+3evbvmyJEjVTk5OefWrFmTBADLly9vrKioqHK5XJUpKSmtmzZtuqpzXzU1NTGff/559RtvvPHtunXrEuPj4zuqq6srq6urK+fPn+8BgNbWVtOsWbOaXS5X5axZs5pfffVVm1HHThHnCgB1oS+qkFvViUiMiDwvIidFxCsiZSLyq5A2d4vIZyLSICKNIlIkIjNC2mwTkYMislBEjgBoA3Bd8L3xIvLfInJORFpE5EsRuTOktFgRyRcRt4h8KyL/LiIR9feJIkdE9cCsVmugoqKi8uOPP47bu3dvXE5OzsRHH3309NGjRy1ZWVnJABAIBGCz2XwAUFpaann66aeTPB6P+fz58+bMzEx3575uv/32xqgo7esrKSmJ3759+4UFwG02WwcADBkyRC1dutQNABkZGef37NkTr+PhUmT7AsAqEfkGQIFS6rse2r0HYCaA9QBqAPwDgD+JyAyl1OFgGzuAt4PvDwVwJ4ASEZmmlOq68L0d2tB2HoAzAI6JSAKA/wXQAmANtJWXpgH4u5A6ngewE8BiADcBeBrAEQB/+FsOnmggi6jgBbQl+xYsWOBZsGCBZ/r06a1btmyxXXPNNa2HDx/+KrTtAw88MOG9997766xZs1o3bdo0sri4+MJC5cOHD7+w2IFSCiIXnyqOiopSJpPpwud2t/QgUT/5ZwC7AGwDoESkClqwvaiUagIAEbkJwHwA85RSxcHf+7OIJAN4EsASAFBK5XXuNNgLLQRwLbTzxRfeAzASwC+7BDZE5DloawdnKKVqgy/v7abeEqXUvwafF4pINoDbweClMBRRQzllZWXR5eXlF5b1O3TokGXSpEltDQ0NUXv27BkGAF6vVw4ePBgDAC0tLaZx48b5vF6vbN++fURP+503b17TSy+9lND5c319vbk/j4PocpRSXwJIBfAbAJsBCIB/A3BQRIYHm/0S2nD05yIS1blBC8YLQ8kikioiH4jIGQAdAHwAUqDNlu7qVNfQDcoC8HGX0O3Jn0N+rgQwtheHSjToGHg5Ue8u/+lLTU1N5tzc3HFNTU1ms9ms7Ha796233jpx7Nix+tzc3HEej8fc0dEhDz744JkZM2a0rVu37vTMmTNTk5KS2lNTU1uam5u7DdTnnnuu9t577x03adKkqSaTST3xxBOnc3Jyvtf7+Ii6Ukp5AXwY3CAi9wHYCuA+AK8AuArAaGhBGqoj+Dtx0ELxDLTZ0Cegnb/dCm2mdFdnutnPSAD/14tyQ/+9tHezf6KwwGUBByAuC0j9RUS+A/AHpdSDIvIfAJYDWNhdW6XUQRG5BcAnAFKVUl912c8xAKVKqcXBn7cBmKaUCp10tR/At0qpRT3UY0c31/H2tD+icBBRQ81EkSI4qSn0NRu0862dPdO90Hq8zUqpg6FbsI0l+Ojtsp/roU2k6o29AG4VkVF/w2EQhaWIm1xFFCHKRWQ3tGHiswDGQ5tV3ALgrWCbQmi92cJg7/cIgHgAvwAQo5T6LYC/AGgG8IaIPA/tvKsTwKle1vGfAO4GsE9EnoU2qzkVwDCl1PM/9yCJBiP2eInCUx60XukmaOH7DLRgnamUOgZcuKb3dgBvAlgNLYTzAcwC8FmwzRlos5tHA9gdbPdPAP7amyKUUvUAbgBwCMDLAAoAPADgm59/iESDE8/xDkA8x0tEFL7Y4yUiItIRg5eIiEhHhk2uSnsrrU+XBSzPKddlWcCSkpLYN998c+S2bdtO9tSmoKAgbuPGjaOKiop6dR6MiIgiR0TNau6LZQHnzp3bMnfu3Jb+qpGIiMJbRA0197Qs4O7du+NSU1OnJCcnT1myZIm9
tbVVAKC4uDg2PT19ckpKypS0tLTUxsZGU0FBQdyNN954DQAUFRXFpqenT05NTZ2Snp4+uaysLPpSn09ERBRRwdvdsoAtLS2ycuXKCTt27Kiprq6u9Pv9eOGFF2xtbW2yfPnyiS+//PI3Lpersri42NV1YQQAcDgcbQcOHPiqqqqqcv369acef/xx3luWiIguKaKGmrtbFvDhhx+uHTt2rHf69OleALjnnnu+e+211xKys7M9CQkJvszMzBYAGDFiRCB0fw0NDeY77rhjwvHjx2NERPl8Pq4+RERElxRRwQtcvCzgO++8M7K7dsGl/i55kfPatWuTMjMzPYWFhTUul2toVlZWSv9UTURE4SKihpq7WxbQZrP5Tp06NbSioiIaAN5+++2Rc+bM8TgcjrYzZ84MLS4ujgWAxsZGk8/340VcmpqazGPHjm0HgPz8/Kt0PBQiIhqkDOvx9vbyn77U07KABw4caFiyZMnEjo4OOByOljVr1tTHxMSod999tyY3N3dcW1ubKSYmJlBSUlLddX9r166tW7FixYRNmzaNnjNnTpPex0NERIMPbxk5APGWkURE4SuihpqJiIiMxuAlIiLSkZ7BGwgEArzc5jKC39FFly4REVF40DN4K+rr660M354FAgGpr6+3AqgwuhYiIuofus1q9vv9K+rq6rbW1dVNA4e4exIAUOH3+1cYXQgREfUP3WY1ExEREXueREREumLwEhER6YjBS0REpCMGLxERkY4YvERERDpi8BIREemIwUtERKQjBi8REZGOGLxEREQ6YvASERHpiMFLRESkIwYvERGRjhi8REREOmLwEhER6ej/AdcpY62fEq5kAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "page_views_by_traffic_source.plot.pie(y = \"count\", labels = [\"Internal\", \"Search\", \"Social\"]\n",
    "                    , figsize = (8, 8), autopct = \"%.2f\", fontsize = 15)\n",
    "plt.title(\"Page views by traffic source\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df events: 9\n",
      "+----------+--------------+-----------+---------+--------+------------+\n",
      "|display_id|          uuid|document_id|timestamp|platform|geo_location|\n",
      "+----------+--------------+-----------+---------+--------+------------+\n",
      "|         1|cb8c55702adb93|     379743|       61|       3|   US>SC>519|\n",
      "|         2|79a85fa78311b9|    1794259|       81|       2|   US>CA>807|\n",
      "|         3|822932ce3d8757|    1179111|      182|       2|   US>MI>505|\n",
      "|         4|85281d0a49f7ac|    1777797|      234|       2|   US>WV>564|\n",
      "|         5|8d0daef4bf5b56|     252458|      338|       2|       SG>00|\n",
      "|         6|7765b4faae4ad4|    1773517|      395|       3|   US>OH>510|\n",
      "|         7|2cc3f6457d16da|    1149661|      602|       2|   US>MT>762|\n",
      "|         8|166fc654d73c98|    1330329|      638|       2|   US>PA>566|\n",
      "|         9|9dddccf70f6067|    1772126|      667|       1|   US>FL>528|\n",
      "|        10|b09a0e92aa4d17|     157455|      693|       1|          US|\n",
      "|        11|602e210c5831e5|    1773230|      710|       3|   US>IL>675|\n",
      "|        12|6fa993bd0e0157|     892701|      718|       1|   US>TX>612|\n",
      "|        13|7355615832b3af|    1778533|      739|       1|   US>AZ>753|\n",
      "|        14|daef797fc210a2|    1759953|      798|       3|   US>NC>560|\n",
      "|        15|24c64dc30891c0|    1777136|     1000|       2|          GB|\n",
      "|        16|30c0ad12b36375|    1727882|     1033|       1|   US>FL>561|\n",
      "|        17|c80c06d718ba65|    1667725|     1044|       2|   US>MI>540|\n",
      "|        18|eb58e66b4f6bb0|     429183|     1175|       2|   US>OR>820|\n",
      "|        19|c419799a427c72|    1155107|     1202|       3|   US>HI>744|\n",
      "|        20|650e3b5699738b|    1642724|     1282|       2|   US>MI>505|\n",
      "+----------+--------------+-----------+---------+--------+------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "events = load(\"events\")\n",
    "events.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+--------------+-----------+-------------------+--------+------------+\n",
      "|display_id|          uuid|document_id|          timestamp|platform|geo_location|\n",
      "+----------+--------------+-----------+-------------------+--------+------------+\n",
      "|         1|cb8c55702adb93|     379743|2016-06-14 09:30:00|       3|   US>SC>519|\n",
      "|         2|79a85fa78311b9|    1794259|2016-06-14 09:30:00|       2|   US>CA>807|\n",
      "|         3|822932ce3d8757|    1179111|2016-06-14 09:30:00|       2|   US>MI>505|\n",
      "|         4|85281d0a49f7ac|    1777797|2016-06-14 09:30:00|       2|   US>WV>564|\n",
      "|         5|8d0daef4bf5b56|     252458|2016-06-14 09:30:00|       2|       SG>00|\n",
      "|         6|7765b4faae4ad4|    1773517|2016-06-14 09:30:00|       3|   US>OH>510|\n",
      "|         7|2cc3f6457d16da|    1149661|2016-06-14 09:30:00|       2|   US>MT>762|\n",
      "|         8|166fc654d73c98|    1330329|2016-06-14 09:30:00|       2|   US>PA>566|\n",
      "|         9|9dddccf70f6067|    1772126|2016-06-14 09:30:00|       1|   US>FL>528|\n",
      "|        10|b09a0e92aa4d17|     157455|2016-06-14 09:30:00|       1|          US|\n",
      "|        11|602e210c5831e5|    1773230|2016-06-14 09:30:00|       3|   US>IL>675|\n",
      "|        12|6fa993bd0e0157|     892701|2016-06-14 09:30:00|       1|   US>TX>612|\n",
      "|        13|7355615832b3af|    1778533|2016-06-14 09:30:00|       1|   US>AZ>753|\n",
      "|        14|daef797fc210a2|    1759953|2016-06-14 09:30:00|       3|   US>NC>560|\n",
      "|        15|24c64dc30891c0|    1777136|2016-06-14 09:30:00|       2|          GB|\n",
      "|        16|30c0ad12b36375|    1727882|2016-06-14 09:30:01|       1|   US>FL>561|\n",
      "|        17|c80c06d718ba65|    1667725|2016-06-14 09:30:01|       2|   US>MI>540|\n",
      "|        18|eb58e66b4f6bb0|     429183|2016-06-14 09:30:01|       2|   US>OR>820|\n",
      "|        19|c419799a427c72|    1155107|2016-06-14 09:30:01|       3|   US>HI>744|\n",
      "|        20|650e3b5699738b|    1642724|2016-06-14 09:30:01|       2|   US>MI>505|\n",
      "+----------+--------------+-----------+-------------------+--------+------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "events = events.withColumn(\"timestamp\", F.expr(\"from_unixtime(cast((timestamp + 1465876799998)/1000 as int))\"))\n",
    "events.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df events: 9\n"
     ]
    }
   ],
   "source": [
    "events = load(\"events\", rebase_timestamp=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "23120126"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find distinct count of users, document, and location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 8 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Row(count(DISTINCT uuid)=19794967, count(DISTINCT document_id)=894060, count(DISTINCT geo_location)=2988)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time events.selectExpr(\"count(distinct uuid)\", \"count(distinct document_id)\", \"count(distinct geo_location)\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Is the display_id unique in the events dataset?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 4 ms, total: 4 ms\n",
      "Wall time: 4.64 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time events.groupBy(\"display_id\").count().filter(\"count>1\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Do all the display_id in the clicks_train present in events?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 4 ms, total: 4 ms\n",
      "Wall time: 5.68 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time clicks_train.select(\"display_id\").distinct().join(events, on = [\"display_id\"], how = \"left_anti\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, display_id for each record in clicks_train is present in events dataset.\n",
    "\n",
    "Check the same for clicks_test dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
      "Wall time: 3.21 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time clicks_test.select(\"display_id\").distinct().join(events, on = [\"display_id\"], how = \"left_anti\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Average events by user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(avg_event_by_user=1.1679800223966021)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.selectExpr(\"count(*)/count(distinct uuid) avg_event_by_user\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Does the timestamp exist for each record in events?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.filter(\"isnull(timestamp)\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Does the timestamp in clicks_train and clicks_test have overlap?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+\n",
      "|summary|\n",
      "+-------+\n",
      "|  count|\n",
      "|   mean|\n",
      "| stddev|\n",
      "|    min|\n",
      "|    max|\n",
      "+-------+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 18.6 ms\n"
     ]
    }
   ],
   "source": [
    "%time clicks_train.join(events, on = [\"display_id\"]).select(\"timestamp\").describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+\n",
      "|summary|\n",
      "+-------+\n",
      "|  count|\n",
      "|   mean|\n",
      "| stddev|\n",
      "|    min|\n",
      "|    max|\n",
      "+-------+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 31.5 ms\n"
     ]
    }
   ],
   "source": [
    "%time clicks_test.join(events, on = [\"display_id\"]).select(\"timestamp\").describe().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Clearly, the date ranges of training and test data overlap."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Alignment between Events and Page Views"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of page views without matching event"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many events have matching page views by uuid and document_id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many events have no page views"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many views records have not matching events "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A given user might visit the same page more than once. Show sample events for which multiple page_views exist.  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Show the distribution of number of ditinct users who view the same document multiple times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def join_views_and_events(columns):\n",
    "    df1 = page_views.select(*columns).withColumn(\"page_views\", F.lit(1)).withColumn(\"events\", F.lit(0))\n",
    "    df2 = events.select(*columns).withColumn(\"page_views\", F.lit(0)).withColumn(\"events\", F.lit(1))\n",
    "    df3 = df1.union(df2)\n",
    "    df4 = df3.groupBy(columns).agg(\n",
    "                F.sum(\"page_views\").alias(\"page_views_count\"), \n",
    "                F.sum(\"events\").alias(\"events_count\"))\n",
    "    return df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+-----------+----------------+------------+\n",
      "|          uuid|document_id|page_views_count|events_count|\n",
      "+--------------+-----------+----------------+------------+\n",
      "|3286c54a862512|     647776|               1|           0|\n",
      "|a29f99948e908e|     647776|               1|           0|\n",
      "|1dd38cfc32d0af|     647776|               1|           0|\n",
      "|2ebe105af88084|     647776|               1|           0|\n",
      "|fc4f1e271ab611|     647776|               1|           0|\n",
      "|7a14dda54e7f60|     649004|               1|           0|\n",
      "|7c04b579125457|     649161|               1|           0|\n",
      "|df1f16aaeff918|     649161|               1|           0|\n",
      "|7932260aef663f|     649395|               1|           0|\n",
      "|b78c5990704b55|     649395|               1|           0|\n",
      "|c9cc4498d587e5|     649395|               1|           0|\n",
      "|d0f57454f9bd8d|     649395|               1|           0|\n",
      "|514bdfac720462|     649395|               1|           0|\n",
      "|5e981a9aa49864|     649707|               1|           0|\n",
      "|7c685aa0deb6de|     649828|               1|           0|\n",
      "|3a2c69d0db5f65|     649828|               1|           0|\n",
      "|f9ab409a1b03df|     649828|               1|           0|\n",
      "|20aaf0adfef08f|     649871|               1|           0|\n",
      "|ce10272e09d625|     649871|               1|           0|\n",
      "|de9d57a2dc71c3|     650130|               1|           0|\n",
      "+--------------+-----------+----------------+------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Cache output to disk. The dataframe is too large to hold in the memory of the current machine \n",
    "views_and_event = join_views_and_events([\"uuid\", \"document_id\"]).persist(StorageLevel.DISK_ONLY)\n",
    "views_and_event.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many user-document combination does not have any click on ads?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 10.2 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "9877706"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time views_and_event.filter(\"page_views_count > 0 and events_count = 0\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many user-doc combination does have machine record in page views?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
      "Wall time: 367 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "22468988"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time views_and_event.filter(\"page_views_count = 0 and events_count > 0\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Considering events represents the page views that have got clicks, what fraction of page views got clicks?  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.312012831201283"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.count()/page_views.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's take a sample uuid and document_id to see whether the event records have a matching page_views record."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-----------+---------+--------+------------+--------------+\n",
      "|uuid|document_id|timestamp|platform|geo_location|traffic_source|\n",
      "+----+-----------+---------+--------+------------+--------------+\n",
      "+----+-----------+---------+--------+------------+--------------+\n",
      "\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 91.1 ms\n"
     ]
    }
   ],
   "source": [
    "%time page_views.filter(\"uuid = 'a34004004c3e50' and document_id = 140264\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+--------------+-----------+-------------------+--------+------------+\n",
      "|display_id|          uuid|document_id|          timestamp|platform|geo_location|\n",
      "+----------+--------------+-----------+-------------------+--------+------------+\n",
      "|      1341|a34004004c3e50|     140264|2016-06-14 09:31:33|       1|   US>CA>868|\n",
      "|  17427544|a34004004c3e50|     140264|2016-06-16 19:24:19|       1|   US>CA>868|\n",
      "+----------+--------------+-----------+-------------------+--------+------------+\n",
      "\n",
      "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
      "Wall time: 1.34 s\n"
     ]
    }
   ],
   "source": [
    "%time events.filter(\"uuid = 'a34004004c3e50' and document_id = 140264\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Hypothesis: The page views record come web server logs. Events some user tracking devices such as omniture, google analytics. User may open the page, the after sometime, user may choose to view an ad. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "User may view a page more than once. Find how many users have viewed the same page more than once. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-----------+-----+\n",
      "|uuid|document_id|count|\n",
      "+----+-----------+-----+\n",
      "+----+-----------+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "repeated_page_views = page_views.groupBy([\"uuid\", \"document_id\"]).count()\\\n",
    "        .filter(\"count > 1\").orderBy(F.desc(\"count\"))\n",
    "repeated_page_views.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "repeated_page_views.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Look at a sample uuid who has repeated visited a page to observe the pattern in the source, location and timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "if repeated_page_views.count()>0:\n",
    "    sample_record = repeated_page_views.sample(True, 0.1).take(1)[0]\n",
    "    page_views.filter(F.col(\"uuid\") == sample_record.uuid)\\\n",
    "        .filter(F.col(\"document_id\") == sample_record.document_id).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advertisement (Promopoted Content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df promoted_content: 4\n",
      "+-----+-----------+-----------+-------------+\n",
      "|ad_id|document_id|campaign_id|advertiser_id|\n",
      "+-----+-----------+-----------+-------------+\n",
      "|    1|       6614|          1|            7|\n",
      "|    2|     471467|          2|            7|\n",
      "|    3|       7692|          3|            7|\n",
      "|    4|     471471|          2|            7|\n",
      "|    5|     471472|          2|            7|\n",
      "|    6|      12736|          1|            7|\n",
      "|    7|      12808|          1|            7|\n",
      "|    8|     471477|          2|            7|\n",
      "|    9|      13379|          1|            7|\n",
      "|   10|      13885|          1|            7|\n",
      "|   11|      14230|          1|            7|\n",
      "|   12|     446701|         10|           19|\n",
      "|   13|     471499|         10|           19|\n",
      "|   14|     471500|         10|           19|\n",
      "|   15|     471501|         10|           19|\n",
      "|   16|     471514|         17|           19|\n",
      "|   17|     471517|         10|           19|\n",
      "|   18|     471518|         10|           19|\n",
      "|   19|     471519|          5|           19|\n",
      "|   20|     446660|         21|           19|\n",
      "+-----+-----------+-----------+-------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "promoted_contents = load(\"promoted_content\")\n",
    "promoted_contents.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "559583"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "promoted_contents.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Promoted_content stores the meta of the ads. Double check the ad_id is unique in this dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
      "Wall time: 135 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time promoted_contents.groupBy(\"ad_id\").count().filter(\"count>1\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many unique campaigns, documents and advertisers are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 214 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Row(count(DISTINCT document_id)=185709, count(DISTINCT campaign_id)=34675, count(DISTINCT advertiser_id)=4385)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time promoted_contents\\\n",
    ".selectExpr(\"count(distinct document_id)\", \"count(distinct campaign_id)\", \"count(distinct advertiser_id)\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Does ad_id in the clicks dataset have meta data info in promoted_contents?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 4 ms, total: 4 ms\n",
      "Wall time: 3.71 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time (clicks_train.select(\"ad_id\").union(clicks_test.select(\"ad_id\"))\\\n",
    "       .join(promoted_contents, on = [\"ad_id\"], how = \"leftanti\").count())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, all the ad_id in clicks dataset exist in the promoted_contents. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find average CTR for campaign"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-------------------+\n",
      "|campaign_id|            avg_ctr|\n",
      "+-----------+-------------------+\n",
      "|      12020|0.18853094256899317|\n",
      "|      15551|0.14168937329700274|\n",
      "|      16636|0.06654206156723358|\n",
      "|      17774| 0.4740740740740741|\n",
      "|      10710|0.07173178014624783|\n",
      "|       7857|0.15448798511873105|\n",
      "|       8748|   0.13014925725663|\n",
      "|      28587|0.03608090199020851|\n",
      "|      26563|0.05487204724409449|\n",
      "|      25870|0.09421250207776988|\n",
      "|      12787|0.26873419406362303|\n",
      "|      24189|0.12057582351582147|\n",
      "|        343| 0.1642336045511124|\n",
      "|      22274|0.14866234594342462|\n",
      "|      27610|0.12141843236348225|\n",
      "|      25673| 0.0909153570443893|\n",
      "|        692|0.10861946684564693|\n",
      "|      17645|0.12021702251393619|\n",
      "|      23815|  0.085025248657077|\n",
      "|      26133| 0.1047878333543433|\n",
      "+-----------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "avg_ctr_by_campaign = promoted_contents.join(ctrs, on = \"ad_id\").groupBy(\"campaign_id\")\\\n",
    ".agg(F.avg(\"ctr\").alias(\"avg_ctr\"))\n",
    "\n",
    "avg_ctr_by_campaign.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find avg ctr by advertiser."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+-------------------+\n",
      "|advertiser_id|            avg_ctr|\n",
      "+-------------+-------------------+\n",
      "|         2349|0.10883058103462902|\n",
      "|         3829| 0.2068642691859514|\n",
      "|         1283|0.14176344970802937|\n",
      "|          434| 0.1043910399182465|\n",
      "|         2030|0.10217395554446719|\n",
      "|         2447|0.05872570281791586|\n",
      "|         2625|0.13082804384293295|\n",
      "|           38|0.13581588111063284|\n",
      "|         1633|0.20279203966409717|\n",
      "|         3404|0.03207982667240285|\n",
      "|         1809|0.15295881650970958|\n",
      "|         2260|0.35096740188296494|\n",
      "|          881| 0.1252447139156763|\n",
      "|         1557|0.09639892315222712|\n",
      "|         1830| 0.2122485949912904|\n",
      "|         2116| 0.1282686945248045|\n",
      "|         2840|0.23191125858194017|\n",
      "|         3572|0.12020942448693174|\n",
      "|          784|0.08288804563485187|\n",
      "|         2859|0.19682314282065358|\n",
      "+-------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "avg_ctr_by_advertiser = promoted_contents.join(ctrs, on = \"ad_id\").groupBy(\"advertiser_id\")\\\n",
    ".agg(F.avg(\"ctr\").alias(\"avg_ctr\"))\n",
    "\n",
    "avg_ctr_by_advertiser.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find avg ctr by document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+--------------------+\n",
      "|document_id|             avg_ctr|\n",
      "+-----------+--------------------+\n",
      "|    1087952| 0.33327873327873325|\n",
      "|     870875| 0.24400340797438524|\n",
      "|    1377696|0.048441767634379414|\n",
      "|    1146963| 0.19168084171412803|\n",
      "|    1083829| 0.32597524434580355|\n",
      "|     378777| 0.08661417322834646|\n",
      "|    1247211|  0.2608837247293875|\n",
      "|     902349|0.058823529411764705|\n",
      "|     907267| 0.03737655352703233|\n",
      "|    1273596| 0.26410511426215816|\n",
      "|    1333191|  0.1767042599220612|\n",
      "|     876083| 0.08421440203631031|\n",
      "|    1767220| 0.16518679146475518|\n",
      "|    1603476| 0.19174833310958966|\n",
      "|    1561030| 0.05284776441477035|\n",
      "|    1176283|  0.3650056262723164|\n",
      "|     630595|0.014084507042253521|\n",
      "|    1590983|                 0.0|\n",
      "|    1154277| 0.19185631204008377|\n",
      "|    1326394|  0.0379591665969375|\n",
      "+-----------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "avg_ctr_by_document = promoted_contents.join(ctrs, on = \"ad_id\").groupBy(\"document_id\")\\\n",
    ".agg(F.avg(\"ctr\").alias(\"avg_ctr\"))\n",
    "\n",
    "avg_ctr_by_document.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Document Attributes\n",
    "\n",
    "### Document Meta Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df documents_meta: 8\n",
      "+-----------+---------+------------+-------------------+\n",
      "|document_id|source_id|publisher_id|publish_time       |\n",
      "+-----------+---------+------------+-------------------+\n",
      "|788428     |579      |9           |2015-03-16 00:00:00|\n",
      "|864787     |579      |9           |2011-08-22 00:00:00|\n",
      "|817971     |579      |9           |2015-07-13 00:00:00|\n",
      "|806663     |579      |9           |2015-06-01 00:00:00|\n",
      "|796001     |579      |9           |2015-04-03 00:00:00|\n",
      "+-----------+---------+------------+-------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_meta = load(\"documents_meta\")\n",
    "documents_meta.show(5, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2999334"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_meta.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify whether the document_id is unique in document_meta."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_meta.groupby(\"document_id\").count().filter(\"count>1\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many source_ids are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14395"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_meta.select(\"source_id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many publisher_ids are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1260"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_meta.select(\"publisher_id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Document Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df documents_categories: 8\n",
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- category_id: integer (nullable = true)\n",
      " |-- confidence_level: double (nullable = true)\n",
      "\n",
      "+-----------+-----------+----------------+\n",
      "|document_id|category_id|confidence_level|\n",
      "+-----------+-----------+----------------+\n",
      "|2416431    |1407       |0.87841876      |\n",
      "|2260393    |1702       |0.92            |\n",
      "|917692     |1702       |0.92            |\n",
      "|1786173    |1706       |0.07            |\n",
      "|1783554    |1709       |0.07            |\n",
      "+-----------+-----------+----------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_categories = load(\"documents_categories\").drop_duplicates([\"document_id\", \"category_id\"])\n",
    "documents_categories.printSchema()\n",
    "documents_categories.show(5, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5481474"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_categories.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_categories.select(\"category_id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "category_id is not indexed. Using string indexer to index it. One Hot Encoding is another option. Here I am not doing it because the I want to use the confidence_level to weigh in. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-----------+----------------+----+\n",
      "|document_id|category_id|confidence_level|  id|\n",
      "+-----------+-----------+----------------+----+\n",
      "|    2416431|       1407|      0.87841876|12.0|\n",
      "|    2260393|       1702|            0.92| 1.0|\n",
      "|     917692|       1702|            0.92| 1.0|\n",
      "|    1786173|       1706|            0.07|13.0|\n",
      "|    1783554|       1709|            0.07|55.0|\n",
      "|    1776902|       1806|     0.597394548|20.0|\n",
      "|    1784492|       1702|     0.556040824| 1.0|\n",
      "|    1812275|       1903|            0.92|15.0|\n",
      "|    1762888|       1613|     0.904981736|38.0|\n",
      "|    1811648|       1702|            0.92| 1.0|\n",
      "|    1110915|       1702|     0.622750116| 1.0|\n",
      "|     983795|       1702|     0.582905652| 1.0|\n",
      "|    1093318|       2004|            0.07| 7.0|\n",
      "|     984200|       1302|     0.436507432|48.0|\n",
      "|    1913424|       1706|            0.07|13.0|\n",
      "|    2061034|       1702|     0.042858662| 1.0|\n",
      "|    1972828|       1702|            0.92| 1.0|\n",
      "|    1904284|       1706|            0.07|13.0|\n",
      "|    1915415|       1403|     0.040102622| 0.0|\n",
      "|    1856631|       1702|     0.716022384| 1.0|\n",
      "+-----------+-----------+----------------+----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import StringIndexer\n",
    "\n",
    "#String indexer required string column\n",
    "documents_categories = documents_categories.withColumn(\"category_id\", F.expr(\"cast(category_id as string)\")) \n",
    "\n",
    "if \"id\" in documents_categories.columns:\n",
    "    documents_categories = documents_categories.drop(\"id\")\n",
    "\n",
    "category_indexer = StringIndexer(inputCol=\"category_id\", outputCol=\"id\")\n",
    "documents_categories = category_indexer.fit(documents_categories).transform(documents_categories)\n",
    "\n",
    "documents_categories.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Group the data by document_id and pack the other information in a array field."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_categories_n = documents_categories\\\n",
    ".withColumn(\"pair\", F.struct(\"id\", \"confidence_level\"))\\\n",
    ".groupBy(\"document_id\")\\\n",
    ".agg(F.collect_list(\"pair\").alias(\"categories\"))\n",
    "\n",
    "documents_categories_n.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2828649"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_categories_n.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-----------------------------------------+\n",
      "|document_id|categories                               |\n",
      "+-----------+-----------------------------------------+\n",
      "|12         |[[0.0, 0.024439128], [15.0, 0.321199968]]|\n",
      "|18         |[[12.0, 0.92], [30.0, 0.07]]             |\n",
      "|38         |[[51.0, 0.92], [32.0, 0.07]]             |\n",
      "+-----------+-----------------------------------------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_categories_n.show(3, False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Document entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df documents_entities: 8\n",
      "+-----------+--------------------------------+-----------------+\n",
      "|document_id|entity_id                       |confidence_level |\n",
      "+-----------+--------------------------------+-----------------+\n",
      "|1090777    |13382f1237391289a06215b1c3fd76b6|0.242424004409274|\n",
      "|1118903    |135629f3dea09fb3793708b55f49ed41|0.327493886222015|\n",
      "|871008     |00eb36948f331eada0a7306328e8cf9b|0.683590151896937|\n",
      "|2779469    |a1729c4252069e909f56b4c41283495b|0.248229877540157|\n",
      "|2126240    |8eff7a6a560c2fd02b0bf53db50ef3c5|0.204488577325427|\n",
      "+-----------+--------------------------------+-----------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_entities = load(\"documents_entities\").drop_duplicates([\"document_id\", \"entity_id\"])\n",
    "documents_entities.show(5, False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find stats around the number of entities per document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+\n",
      "|summary|             count|\n",
      "+-------+------------------+\n",
      "|  count|           1791420|\n",
      "|   mean| 3.091152270266046|\n",
      "| stddev|2.3904599699254128|\n",
      "|    min|                 1|\n",
      "|    max|                10|\n",
      "+-------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_entities.groupBy(\"document_id\").count().select(\"count\").describe().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find count of unique entity_ids."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1326009"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_entities.select(\"entity_id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply StringIndex to index the entity_id."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+--------------------+-----------------+---------+\n",
      "|document_id|           entity_id| confidence_level|       id|\n",
      "+-----------+--------------------+-----------------+---------+\n",
      "|    1090777|13382f1237391289a...|0.242424004409274|   8166.0|\n",
      "|    1118903|135629f3dea09fb37...|0.327493886222015| 803292.0|\n",
      "|     871008|00eb36948f331eada...|0.683590151896937| 328978.0|\n",
      "|    2779469|a1729c4252069e909...|0.248229877540157|  97093.0|\n",
      "|    2126240|8eff7a6a560c2fd02...|0.204488577325427|     69.0|\n",
      "|    2193944|6a5157fd41504fb9c...|0.410463920637349| 318452.0|\n",
      "|    2978723|2e19d4df6d5c5d041...|0.633742282070604| 102664.0|\n",
      "|    1962013|a0650e1f56779ef9f...|0.817492792951132| 235150.0|\n",
      "|    1992396|e9690f32b711268ba...|0.338501150254482|  39067.0|\n",
      "|    1887868|11fa53caa77b3a322...|0.495300779508544| 263301.0|\n",
      "|    1887868|94c4db3ce2d1c5bb3...|0.248107088270607| 129942.0|\n",
      "|    1860593|7c50e96cb9213404d...|0.228940055714979| 356922.0|\n",
      "|    1936934|896ebce354e38fd15...|  0.8329473780528|      5.0|\n",
      "|    1936934|14f26ef84f0c9030c...|0.348393207151086|  26863.0|\n",
      "|    1962015|d9c3d349db59cd6c8...|0.721560136969932|  12858.0|\n",
      "|    1044282|c917e68c195155e4d...| 0.57021467052628|     78.0|\n",
      "|    1125936|90e29901a9ac6b5f2...|0.370813052702074| 189139.0|\n",
      "|    1086306|f69031a6ca419ab35...|0.870062855759672| 294076.0|\n",
      "|     975097|a5538a1f4abd62ca0...|0.215142725024088|1297045.0|\n",
      "|    1020830|e5119362abee65845...|0.789546225845562|   2183.0|\n",
      "+-----------+--------------------+-----------------+---------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "if \"id\" in documents_entities.columns:\n",
    "    documents_entities = documents_entities.drop(\"id\")\n",
    "\n",
    "documents_entities =  StringIndexer(inputCol=\"entity_id\", outputCol=\"id\")\\\n",
    ".fit(documents_entities)\\\n",
    ".transform(documents_entities)\n",
    "\n",
    "documents_entities.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_entities_n = documents_entities\\\n",
    ".withColumn(\"pair\", F.struct(\"id\", \"confidence_level\"))\\\n",
    ".groupBy(\"document_id\")\\\n",
    ".agg(F.collect_list(\"pair\").alias(\"entities\"))\n",
    "\n",
    "documents_entities_n.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Document topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df documents_topics: 8\n",
      "+-----------+--------+------------------+\n",
      "|document_id|topic_id|confidence_level  |\n",
      "+-----------+--------+------------------+\n",
      "|1452655    |196     |0.106394462699838 |\n",
      "|1320105    |247     |0.0571471264056742|\n",
      "|1320105    |147     |0.0276004057198086|\n",
      "|1328313    |235     |0.0152867401656747|\n",
      "|1408191    |125     |0.0448081967939227|\n",
      "+-----------+--------+------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_topics = load(\"documents_topics\").drop_duplicates([\"document_id\",\"topic_id\"])\n",
    "documents_topics.show(5, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299]\n"
     ]
    }
   ],
   "source": [
    "print(sorted([v.topic_id for v in documents_topics.select(\"topic_id\").distinct().collect()]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Topic_id seems already indexed. And there 300 topics are there."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "300"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_topics.select(\"topic_id\").distinct().count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "documents_topics_n = documents_topics\\\n",
    ".toDF(\"document_id\", \"id\", \"confidence_level\")\\\n",
    ".withColumn(\"pair\", F.struct(\"id\", \"confidence_level\"))\\\n",
    ".groupBy(\"document_id\")\\\n",
    ".agg(F.collect_list(\"pair\").alias(\"topics\"))\n",
    "\n",
    "documents_topics_n.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2495423"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents_topics_n.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a field new field for each dataset to indicate the original source file and then join all 4 datasets - categories, entities, topics and meta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- source_id: integer (nullable = true)\n",
      " |-- publisher_id: integer (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "docs = documents_meta.join(documents_categories_n, on = \"document_id\", how = \"full\")\n",
    "docs = docs.join(documents_topics_n, on = \"document_id\", how = \"full\")\n",
    "docs = docs.join(documents_entities_n, on = \"document_id\", how = \"full\")\n",
    "docs.persist(StorageLevel.DISK_ONLY).count()\n",
    "docs.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>document_id</th>\n",
       "      <th>source_id</th>\n",
       "      <th>publisher_id</th>\n",
       "      <th>publish_time</th>\n",
       "      <th>categories</th>\n",
       "      <th>topics</th>\n",
       "      <th>entities</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12</td>\n",
       "      <td>650</td>\n",
       "      <td>680</td>\n",
       "      <td>2007-05-31 18:00:00</td>\n",
       "      <td>[(0.0, 0.024439128), (15.0, 0.321199968)]</td>\n",
       "      <td>[(108, 0.00946375424175878), (35, 0.0125694956...</td>\n",
       "      <td>[(410534.0, 0.796887026745963)]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18</td>\n",
       "      <td>2186</td>\n",
       "      <td>184</td>\n",
       "      <td>2007-06-22 08:00:00</td>\n",
       "      <td>[(12.0, 0.92), (30.0, 0.07)]</td>\n",
       "      <td>[(244, 0.133812476391282), (75, 0.042843200875...</td>\n",
       "      <td>[(1195873.0, 0.241360234287563)]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>2186</td>\n",
       "      <td>184</td>\n",
       "      <td>2007-09-12 08:00:00</td>\n",
       "      <td>[(51.0, 0.92), (32.0, 0.07)]</td>\n",
       "      <td>[(147, 0.0111970150666592), (283, 0.0285988254...</td>\n",
       "      <td>[(766680.0, 0.321454645669068), (109171.0, 0.2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>70</td>\n",
       "      <td>2186</td>\n",
       "      <td>184</td>\n",
       "      <td>2007-10-11 00:00:00</td>\n",
       "      <td>[(32.0, 0.07), (51.0, 0.92)]</td>\n",
       "      <td>[(64, 0.0184155045959176), (244, 0.05646911220...</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>93</td>\n",
       "      <td>2186</td>\n",
       "      <td>184</td>\n",
       "      <td>2006-07-04 00:00:00</td>\n",
       "      <td>[(14.0, 0.645087164), (32.0, 0.049082719)]</td>\n",
       "      <td>[(247, 0.0385121909471879), (196, 0.0400245756...</td>\n",
       "      <td>[(153060.0, 0.938539922562667), (73158.0, 0.31...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   document_id  source_id  publisher_id        publish_time  \\\n",
       "0           12        650           680 2007-05-31 18:00:00   \n",
       "1           18       2186           184 2007-06-22 08:00:00   \n",
       "2           38       2186           184 2007-09-12 08:00:00   \n",
       "3           70       2186           184 2007-10-11 00:00:00   \n",
       "4           93       2186           184 2006-07-04 00:00:00   \n",
       "\n",
       "                                   categories  \\\n",
       "0   [(0.0, 0.024439128), (15.0, 0.321199968)]   \n",
       "1                [(12.0, 0.92), (30.0, 0.07)]   \n",
       "2                [(51.0, 0.92), (32.0, 0.07)]   \n",
       "3                [(32.0, 0.07), (51.0, 0.92)]   \n",
       "4  [(14.0, 0.645087164), (32.0, 0.049082719)]   \n",
       "\n",
       "                                              topics  \\\n",
       "0  [(108, 0.00946375424175878), (35, 0.0125694956...   \n",
       "1  [(244, 0.133812476391282), (75, 0.042843200875...   \n",
       "2  [(147, 0.0111970150666592), (283, 0.0285988254...   \n",
       "3  [(64, 0.0184155045959176), (244, 0.05646911220...   \n",
       "4  [(247, 0.0385121909471879), (196, 0.0400245756...   \n",
       "\n",
       "                                            entities  \n",
       "0                    [(410534.0, 0.796887026745963)]  \n",
       "1                   [(1195873.0, 0.241360234287563)]  \n",
       "2  [(766680.0, 0.321454645669068), (109171.0, 0.2...  \n",
       "3                                               None  \n",
       "4  [(153060.0, 0.938539922562667), (73158.0, 0.31...  "
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%show docs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find count of null values of each type of information - meta, category, entity and topic."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(null_topics=503911, null_categories=170685, null_entities=1207914, null_meta=64024)"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs.selectExpr(\"sum(if(isnull(topics), 1, 0)) null_topics\"\n",
    "              , \"sum(if(isnull(categories), 1, 0)) null_categories\"\n",
    "              , \"sum(if(isnull(entities), 1, 0)) null_entities\"\n",
    "              , \"sum(if(isnull(publisher_id), 1, 0)) null_meta\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vectorize and caculate weighted IDF scores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many document topics are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "300"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs.select(F.explode(\"topics\")).select(\"col.id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many document categories are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs.select(F.explode(\"categories\")).select(\"col.id\").distinct().count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many document entities are there?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1326009"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs.select(F.explode(\"entities\")).select(\"col.id\").distinct().count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- source_id: integer (nullable = true)\n",
      " |-- publisher_id: integer (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics_vector: vector (nullable = true)\n",
      " |-- categories_vector: vector (nullable = true)\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "2999334"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.ml.linalg import SparseVector, VectorUDT\n",
    "\n",
    "def to_vector(values, n):\n",
    "    if values is not None:\n",
    "        values = sorted(values, key=lambda v: v.id)\n",
    "        indices = [v.id for v in values]\n",
    "        values = [v.confidence_level for v in values]\n",
    "        return SparseVector(n, indices, values)\n",
    "    return SparseVector(n, [], [])\n",
    "\n",
    "spark.udf.register(\"to_vector\", to_vector, VectorUDT())\n",
    "\n",
    "docs_vectorized = docs\\\n",
    ".withColumn(\"topics_vector\", F.expr(\"to_vector(topics, 300)\"))\\\n",
    ".withColumn(\"categories_vector\", F.expr(\"to_vector(categories, 97)\"))\n",
    "#.withColumn(\"entities_vector\", F.expr(\"to_vector(entities, 1326009)\"))\n",
    "\n",
    "docs_vectorized.printSchema()\n",
    "docs_vectorized.cache().count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(topics_vector=SparseVector(300, {35: 0.0126, 108: 0.0095}))"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs_vectorized.select(\"topics_vector\").first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2999334"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs_vectorized.select(\"categories_vector\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- source_id: integer (nullable = true)\n",
      " |-- publisher_id: integer (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics_vector: vector (nullable = true)\n",
      " |-- categories_vector: vector (nullable = true)\n",
      " |-- topics_idf: vector (nullable = true)\n",
      " |-- categories_idf: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import IDF, Tokenizer\n",
    "\n",
    "if \"topics_idf\" in docs.columns:\n",
    "    docs = docs.drop(\"topics_idf\")\n",
    "if \"entities_idf\" in docs.columns:\n",
    "    docs = docs.drop(\"entities_idf\")\n",
    "if \"categories_idf\" in docs.columns:\n",
    "    docs = docs.drop(\"categories_idf\")\n",
    "\n",
    "topics_idf = IDF(inputCol=\"topics_vector\", outputCol=\"topics_idf\")\n",
    "entities_idf = IDF(inputCol=\"entities_vector\", outputCol=\"entities_idf\")\n",
    "categories_idf = IDF(inputCol=\"categories_vector\", outputCol=\"categories_idf\")\n",
    "\n",
    "df1 = docs_vectorized\n",
    "df2 = topics_idf.fit(df1).transform(df1).cache()\n",
    "df3 = categories_idf.fit(df2).transform(df2).cache()\n",
    "#df4 = entities_idf.fit(df3).transform(df3).cache()\n",
    "\n",
    "docs_idf = df3\n",
    "docs_idf.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(document_id=12, topics_idf=SparseVector(300, {35: 0.0448, 108: 0.0402}), categories_idf=SparseVector(97, {0: 0.0405, 15: 1.1088}))"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs_idf.select(\"document_id\", \"topics_idf\", \"categories_idf\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Generation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### User profile\n",
    "\n",
    "| Column  |  Description |\n",
    "|---|---|\n",
    "|user_has_already_viewed_doc|  For each content recommended to the user, verify whether the user had previously visited that pages.  \n",
    "|user_views_count | Do eager readers behave differently from other users? Let’s add this feature and let machine learning models guess that.\n",
    "|user_views_categories, user_views_topics, user_views_entities | User profile vectors based on categories, topics and entities of documents that users have previously viewed (weighted by confidence and TF-IDF), to model users preferences in a Content-Based Filtering approach\n",
    "|user_avg_views_of_distinct_docs | Ratio between (#user_distinct_docs_views / #user_views), indicating how often users read previously visited pages again. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uuid</th>\n",
       "      <th>document_id</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>user_has_already_viewed_doc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [uuid, document_id, timestamp, user_has_already_viewed_doc]\n",
       "Index: []"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_has_already_viewed_doc = (page_views\n",
    ".withColumn(\"user_has_already_viewed_doc\"\n",
    "            , F.expr(\"((ROW_NUMBER() OVER (PARTITION BY uuid, document_id ORDER BY timestamp))) > 1\"))\n",
    ".select(\"uuid\", \"document_id\", \"timestamp\", \"user_has_already_viewed_doc\")\n",
    "             )\n",
    "%show user_has_already_viewed_doc.filter(\"uuid = '6c4a7527da27d7' and document_id = 38922\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uuid</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>user_views_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [uuid, timestamp, user_views_count]\n",
       "Index: []"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_views_count = (page_views\n",
    "    .withColumn(\"user_views_count\", \n",
    "        F.expr(\"COUNT(1) OVER (PARTITION BY uuid ORDER BY timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND -1 FOLLOWING)\"))\n",
    "    .select(\"uuid\", \"timestamp\", \"user_views_count\"))\n",
    "\n",
    "%show user_views_count.filter(\"uuid = '6c4a7527da27d7' and document_id = 38922\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#page_views = page_views.withColumn(\"user_avg_views_of_distinct_docs\", F.expr(\"COUNT(distinct document_id) \" + \n",
    "#                \"OVER (PARTITION BY uuid ORDER BY timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND -1 FOLLOWING)\"))\n",
    "#\n",
    "#%show page_views.filter(\"uuid = '6c4a7527da27d7' and document_id = 38922\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Ads and Documents\n",
    "| Column  |  Description |\n",
    "|---|---|\n",
    "|doc_ad_days_since_published, doc_event_days_since_published | Days elapsed since the ad document was published in a given user visit. The general assumption is that new content is more relevant to users. But if you are reading an old post, you might be interested in other old posts.\n",
    "|doc_avg_views_by_distinct_users_cf | Average page views of the ad document by distinct users. Is this a webpage people usually return to?\n",
    "|ad_views_count, doc_views_count|How popular is a document or ad?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-----------+-------------------+-------------------+----+\n",
      "|display_id|document_id|          timestamp|       publish_time| age|\n",
      "+----------+-----------+-------------------+-------------------+----+\n",
      "|    152993|         12|2016-06-14 13:49:33|2007-05-31 18:00:00|3302|\n",
      "|    339733|         12|2016-06-14 18:03:45|2007-05-31 18:00:00|3302|\n",
      "|    368123|         12|2016-06-14 18:27:56|2007-05-31 18:00:00|3302|\n",
      "|    498757|         12|2016-06-14 20:12:53|2007-05-31 18:00:00|3302|\n",
      "|    532289|         12|2016-06-14 20:40:28|2007-05-31 18:00:00|3302|\n",
      "|    621186|         12|2016-06-14 21:51:17|2007-05-31 18:00:00|3302|\n",
      "|    690370|         12|2016-06-14 22:43:50|2007-05-31 18:00:00|3302|\n",
      "|    824952|         12|2016-06-15 00:22:16|2007-05-31 18:00:00|3303|\n",
      "|    835514|         12|2016-06-15 00:30:08|2007-05-31 18:00:00|3303|\n",
      "|    921140|         12|2016-06-15 01:34:24|2007-05-31 18:00:00|3303|\n",
      "|    939891|         12|2016-06-15 01:48:40|2007-05-31 18:00:00|3303|\n",
      "|    999400|         12|2016-06-15 02:33:34|2007-05-31 18:00:00|3303|\n",
      "|   1004350|         12|2016-06-15 02:37:30|2007-05-31 18:00:00|3303|\n",
      "|   1032310|         12|2016-06-15 02:59:47|2007-05-31 18:00:00|3303|\n",
      "|   1036260|         12|2016-06-15 03:03:03|2007-05-31 18:00:00|3303|\n",
      "|   1051266|         12|2016-06-15 03:15:14|2007-05-31 18:00:00|3303|\n",
      "|   1175371|         12|2016-06-15 05:08:16|2007-05-31 18:00:00|3303|\n",
      "|   1185129|         12|2016-06-15 05:17:43|2007-05-31 18:00:00|3303|\n",
      "|   1241190|         12|2016-06-15 06:12:39|2007-05-31 18:00:00|3303|\n",
      "|   1333437|         12|2016-06-15 07:39:53|2007-05-31 18:00:00|3303|\n",
      "+----------+-----------+-------------------+-------------------+----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "doc_event_days_since_published = (events\n",
    "    .join(documents_meta, on = \"document_id\")\n",
    "    .selectExpr(\"display_id\"\n",
    "                , \"document_id\"\n",
    "                , \"timestamp\"\n",
    "                , \"publish_time\"\n",
    "                , \"datediff(timestamp, publish_time) age\")\n",
    ")\n",
    "\n",
    "doc_event_days_since_published.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+------------------------------+\n",
      "|document_id|page_view_count_by_document_id|\n",
      "+-----------+------------------------------+\n",
      "|     654221|             3.258096538021482|\n",
      "|     661199|            2.0794415416798357|\n",
      "|     668458|             2.302585092994046|\n",
      "|     681321|            4.3694478524670215|\n",
      "|     697234|            2.3978952727983707|\n",
      "|     708354|            1.0986122886681098|\n",
      "|     715881|            1.0986122886681098|\n",
      "|     718954|             1.791759469228055|\n",
      "|     736505|             2.302585092994046|\n",
      "|     741481|             4.762173934797756|\n",
      "|     751011|            5.3706380281276624|\n",
      "|     765844|            2.1972245773362196|\n",
      "|     773914|             2.833213344056216|\n",
      "|     777173|             2.772588722239781|\n",
      "|     785600|              4.23410650459726|\n",
      "|     791242|             2.995732273553991|\n",
      "|     792002|            3.6375861597263857|\n",
      "|     793177|             3.332204510175204|\n",
      "|     796066|             4.709530201312334|\n",
      "|     796365|            1.0986122886681098|\n",
      "+-----------+------------------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "page_view_count_by_document_id = page_views.groupBy(\"document_id\")\\\n",
    ".count().withColumn(\"page_view_count_by_document_id\", F.expr(\"log(count)\"))\\\n",
    ".select(\"document_id\", \"page_view_count_by_document_id\")\n",
    "\n",
    "page_view_count_by_document_id.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+-----------------------+\n",
      "|          uuid|page_view_count_by_uuid|\n",
      "+--------------+-----------------------+\n",
      "|6ec2f1df169162|                    0.0|\n",
      "|682ff7de2c98c2|                    0.0|\n",
      "|bc12ea5e483fc0|                    0.0|\n",
      "|df0e90379a76d6|                    0.0|\n",
      "|3286c54a862512|     0.6931471805599453|\n",
      "|fa69933c24d064|                    0.0|\n",
      "|329a791d36dedb|                    0.0|\n",
      "|3cd95a6e6271a6|                    0.0|\n",
      "|ab642647e0361a|                    0.0|\n",
      "|7f6657ac2a45c1|                    0.0|\n",
      "|5c61bf0cedf30f|                    0.0|\n",
      "|c9b5daf2b577d0|                    0.0|\n",
      "|e870b628cf651b|                    0.0|\n",
      "|6be4e8057c3c71|                    0.0|\n",
      "|a4d1897a905a05|                    0.0|\n",
      "|27e48f2ec0626a|                    0.0|\n",
      "|6f3a61ca09e99a|                    0.0|\n",
      "|b5789a234c82a2|     0.6931471805599453|\n",
      "|5131b2d9a6c741|                    0.0|\n",
      "|d77ffd5f6727a9|                    0.0|\n",
      "+--------------+-----------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "page_view_count_by_uuid = (page_views\n",
    ".groupBy(\"uuid\")\n",
    ".count()\n",
    ".withColumn(\"page_view_count_by_uuid\", F.expr(\"log(count)\"))\n",
    ".select(\"uuid\", \"page_view_count_by_uuid\"))\n",
    "\n",
    "page_view_count_by_uuid.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Events\n",
    "\n",
    "| Column  |  Description |\n",
    "|---|---|\n",
    "|event_local_hour (binned), event_weekend | Event timestamps were in UTC-4, so I processed event geolocation to get timezones and adjust for users' local time. They were binned in periods like morning, afternoon, midday, evening, night. A flag indicating whether it was a weekend was also included. The assumption here is that time influences the kind of content users will read.\n",
    "|event_country, event_country_state | The field event_geolocation was parsed to extract the user’s country and state in a page visit.\n",
    "|ad_id, doc_event_id, doc_ad_id, ad_advertiser, … | All of the original categorical fields were One-Hot Encoded to be used by the models, generating about 126,000 features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+------------+-------+-----+\n",
      "|          uuid|geo_location|country|state|\n",
      "+--------------+------------+-------+-----+\n",
      "|cb8c55702adb93|   US>SC>519|     US|   SC|\n",
      "|79a85fa78311b9|   US>CA>807|     US|   CA|\n",
      "|822932ce3d8757|   US>MI>505|     US|   MI|\n",
      "|85281d0a49f7ac|   US>WV>564|     US|   WV|\n",
      "|8d0daef4bf5b56|       SG>00|     SG|   00|\n",
      "|7765b4faae4ad4|   US>OH>510|     US|   OH|\n",
      "|2cc3f6457d16da|   US>MT>762|     US|   MT|\n",
      "|166fc654d73c98|   US>PA>566|     US|   PA|\n",
      "|9dddccf70f6067|   US>FL>528|     US|   FL|\n",
      "|b09a0e92aa4d17|          US|     US| null|\n",
      "|602e210c5831e5|   US>IL>675|     US|   IL|\n",
      "|6fa993bd0e0157|   US>TX>612|     US|   TX|\n",
      "|7355615832b3af|   US>AZ>753|     US|   AZ|\n",
      "|daef797fc210a2|   US>NC>560|     US|   NC|\n",
      "|24c64dc30891c0|          GB|     GB| null|\n",
      "|30c0ad12b36375|   US>FL>561|     US|   FL|\n",
      "|c80c06d718ba65|   US>MI>540|     US|   MI|\n",
      "|eb58e66b4f6bb0|   US>OR>820|     US|   OR|\n",
      "|c419799a427c72|   US>HI>744|     US|   HI|\n",
      "|650e3b5699738b|   US>MI>505|     US|   MI|\n",
      "+--------------+------------+-------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "events.selectExpr(\"uuid\", \"geo_location\"\n",
    "                  , \"split(geo_location, '>')[0] country\"\n",
    "                  , \"split(geo_location, '>')[1] state\"\n",
    "                 ).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DZ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BB</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>PH</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>MX</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>226</th>\n",
       "      <td>SE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227</th>\n",
       "      <td>PE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228</th>\n",
       "      <td>MK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229</th>\n",
       "      <td>MA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>230</th>\n",
       "      <td>SN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>231 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    country\n",
       "0        DZ\n",
       "1        BB\n",
       "2        CA\n",
       "3        PH\n",
       "4        MX\n",
       "..      ...\n",
       "226      SE\n",
       "227      PE\n",
       "228      MK\n",
       "229      MA\n",
       "230      SN\n",
       "\n",
       "[231 rows x 1 columns]"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.selectExpr(\"split(geo_location, '>')[0] country\").distinct().toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-------------------+----+-----------+\n",
      "|display_id|          timestamp|hour|day_session|\n",
      "+----------+-------------------+----+-----------+\n",
      "|         1|2016-06-14 09:30:00|   9|          1|\n",
      "|         2|2016-06-14 09:30:00|   9|          1|\n",
      "|         3|2016-06-14 09:30:00|   9|          1|\n",
      "|         4|2016-06-14 09:30:00|   9|          1|\n",
      "|         5|2016-06-14 09:30:00|   9|          1|\n",
      "|         6|2016-06-14 09:30:00|   9|          1|\n",
      "|         7|2016-06-14 09:30:00|   9|          1|\n",
      "|         8|2016-06-14 09:30:00|   9|          1|\n",
      "|         9|2016-06-14 09:30:00|   9|          1|\n",
      "|        10|2016-06-14 09:30:00|   9|          1|\n",
      "|        11|2016-06-14 09:30:00|   9|          1|\n",
      "|        12|2016-06-14 09:30:00|   9|          1|\n",
      "|        13|2016-06-14 09:30:00|   9|          1|\n",
      "|        14|2016-06-14 09:30:00|   9|          1|\n",
      "|        15|2016-06-14 09:30:00|   9|          1|\n",
      "|        16|2016-06-14 09:30:01|   9|          1|\n",
      "|        17|2016-06-14 09:30:01|   9|          1|\n",
      "|        18|2016-06-14 09:30:01|   9|          1|\n",
      "|        19|2016-06-14 09:30:01|   9|          1|\n",
      "|        20|2016-06-14 09:30:01|   9|          1|\n",
      "+----------+-------------------+----+-----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "(events\n",
    " .selectExpr(\"display_id\", \"timestamp\", \"hour(timestamp) hour\")\n",
    " .withColumn(\"day_session\", F.expr(\"hour % 8\"))).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Top countries by ad clicks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------+\n",
      "|country|   count|\n",
      "+-------+--------+\n",
      "|     US|18595452|\n",
      "|     CA| 1215350|\n",
      "|     GB| 1117544|\n",
      "|     AU|  483021|\n",
      "|     IN|  228461|\n",
      "|     ZA|  111523|\n",
      "|     NZ|  109802|\n",
      "|     PH|   85338|\n",
      "|     DE|   82384|\n",
      "|     SG|   81975|\n",
      "|     MY|   53398|\n",
      "|     NL|   51209|\n",
      "|     NG|   41946|\n",
      "|     IE|   39505|\n",
      "|     SE|   38931|\n",
      "|     FR|   38755|\n",
      "|     MX|   35044|\n",
      "|     IT|   28727|\n",
      "|     KE|   26607|\n",
      "|     JP|   25955|\n",
      "+-------+--------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "events.selectExpr(\"split(geo_location, '>')[0] country\")\\\n",
    ".groupBy(\"country\").count().orderBy(F.desc(\"count\")).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Average CTR\n",
    "\n",
    "| Column  |  Description |\n",
    "|---|---|\n",
    "|avg_ctr_ad_id, avg_ctr_publisher_id, avg_ctr_advertiser_id, avg_ctr_campain_id, avg_ctr_entity_id_country … | Average CTR (#clicks / #views) given some categorical combinations and CTR confidence (details on Part II post). Eg. P(click  category01, category02)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[ad_id: int, document_id: int, campaign_id: int, advertiser_id: int]"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.cache()\n",
    "clicks_train.cache()\n",
    "documents_meta.cache()\n",
    "promoted_contents.cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ad_id</th>\n",
       "      <th>avg_ctr_by_ad_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>71547</td>\n",
       "      <td>0.080267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>152141</td>\n",
       "      <td>0.163381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>35982</td>\n",
       "      <td>0.319444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>220315</td>\n",
       "      <td>0.081429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28347</td>\n",
       "      <td>0.219754</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ad_id  avg_ctr_by_ad_id\n",
       "0   71547          0.080267\n",
       "1  152141          0.163381\n",
       "2   35982          0.319444\n",
       "3  220315          0.081429\n",
       "4   28347          0.219754"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "avg_ctrs_by_ad_id = clicks_train.groupBy(\"ad_id\").agg(F.avg(\"clicked\").alias(\"avg_ctr_by_ad_id\"))\n",
    "%show avg_ctrs_by_ad_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>campaign_id</th>\n",
       "      <th>avg_ctr_by_campaign_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16636</td>\n",
       "      <td>0.258839</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22411</td>\n",
       "      <td>0.133272</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>18034</td>\n",
       "      <td>0.101874</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1384</td>\n",
       "      <td>0.315013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>25151</td>\n",
       "      <td>0.035303</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   campaign_id  avg_ctr_by_campaign_id\n",
       "0        16636                0.258839\n",
       "1        22411                0.133272\n",
       "2        18034                0.101874\n",
       "3         1384                0.315013\n",
       "4        25151                0.035303"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "avg_ctrs_by_campaign_id = (clicks_train\n",
    ".join(promoted_contents, on = \"ad_id\")\n",
    ".groupBy(\"campaign_id\")\n",
    ".agg(F.avg(\"clicked\").alias(\"avg_ctr_by_campaign_id\")))\n",
    "\n",
    "%show avg_ctrs_by_campaign_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>advertiser_id</th>\n",
       "      <th>avg_ctr_by_advertiser_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2030</td>\n",
       "      <td>0.318668</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2447</td>\n",
       "      <td>0.096403</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2589</td>\n",
       "      <td>0.116398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1521</td>\n",
       "      <td>0.157812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>784</td>\n",
       "      <td>0.247658</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   advertiser_id  avg_ctr_by_advertiser_id\n",
       "0           2030                  0.318668\n",
       "1           2447                  0.096403\n",
       "2           2589                  0.116398\n",
       "3           1521                  0.157812\n",
       "4            784                  0.247658"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean of `clicked` per advertiser: training clicks are matched to their ad's\n",
    "# row in promoted_contents on ad_id, then averaged within each advertiser_id.\n",
    "# The result is marked for caching.\n",
    "avg_ctrs_by_advertiser_id = (\n",
    "    clicks_train.join(promoted_contents, on=[\"ad_id\"], how=\"inner\")\n",
    "    .groupBy(\"advertiser_id\")\n",
    "    .agg(F.avg(F.col(\"clicked\")).alias(\"avg_ctr_by_advertiser_id\"))\n",
    "    .cache()\n",
    ")\n",
    "\n",
    "%show avg_ctrs_by_advertiser_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>document_id</th>\n",
       "      <th>avg_ctr_by_document_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>876083</td>\n",
       "      <td>0.163142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1183136</td>\n",
       "      <td>0.101874</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1260980</td>\n",
       "      <td>0.402718</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1377696</td>\n",
       "      <td>0.064897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1108162</td>\n",
       "      <td>0.129640</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   document_id  avg_ctr_by_document_id\n",
       "0       876083                0.163142\n",
       "1      1183136                0.101874\n",
       "2      1260980                0.402718\n",
       "3      1377696                0.064897\n",
       "4      1108162                0.129640"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean of `clicked` per document_id: training clicks are matched to their ad's\n",
    "# row in promoted_contents on ad_id, then averaged within each document_id.\n",
    "# The result is marked for caching.\n",
    "avg_ctrs_by_document_id = (\n",
    "    clicks_train.join(promoted_contents, on=[\"ad_id\"], how=\"inner\")\n",
    "    .groupBy(\"document_id\")\n",
    "    .agg(F.avg(F.col(\"clicked\")).alias(\"avg_ctr_by_document_id\"))\n",
    "    .cache()\n",
    ")\n",
    "\n",
    "%show avg_ctrs_by_document_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>display_id</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>avg_ctr_by_advertiser_id</th>\n",
       "      <th>avg_ctr_by_campaign_id</th>\n",
       "      <th>avg_ctr_by_ad_id</th>\n",
       "      <th>avg_ctr_by_publisher_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>2016-06-14 09:30:01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>19</td>\n",
       "      <td>2016-06-14 09:30:01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>566</td>\n",
       "      <td>2016-06-14 09:30:42</td>\n",
       "      <td>0.307692</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>566</td>\n",
       "      <td>2016-06-14 09:30:42</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>566</td>\n",
       "      <td>2016-06-14 09:30:42</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   display_id           timestamp  avg_ctr_by_advertiser_id  \\\n",
       "0          19 2016-06-14 09:30:01                       NaN   \n",
       "1          19 2016-06-14 09:30:01                       NaN   \n",
       "2         566 2016-06-14 09:30:42                  0.307692   \n",
       "3         566 2016-06-14 09:30:42                  0.444444   \n",
       "4         566 2016-06-14 09:30:42                  0.500000   \n",
       "\n",
       "   avg_ctr_by_campaign_id  avg_ctr_by_ad_id  avg_ctr_by_publisher_id  \n",
       "0                     NaN               NaN                      NaN  \n",
       "1                     NaN               NaN                 0.000000  \n",
       "2                     0.5               0.5                 0.500000  \n",
       "3                     NaN               NaN                 0.333333  \n",
       "4                     NaN               NaN                 0.250000  "
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Running CTRs: for each grouping key, clicks and row counts are accumulated\n",
    "# over the rows that sort before the current row by timestamp within the key's\n",
    "# partition (the current row itself is excluded), then divided.\n",
    "_PRIOR_ROWS = \"ORDER BY timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND -1 FOLLOWING\"\n",
    "\n",
    "# Events enriched with document metadata, training clicks and ad metadata.\n",
    "avg_ctrs_by_time = (\n",
    "    events\n",
    "    .join(documents_meta, on=\"document_id\", how=\"left\")\n",
    "    .join(clicks_train, on=\"display_id\", how=\"left\")\n",
    "    .join(promoted_contents, on=\"ad_id\", how=\"left\")\n",
    ")\n",
    "\n",
    "# One SUM(clicked) / COUNT(*) window pair per grouping key.\n",
    "for _key in [\"ad_id\", \"advertiser_id\", \"campaign_id\", \"publisher_id\"]:\n",
    "    avg_ctrs_by_time = (\n",
    "        avg_ctrs_by_time\n",
    "        .withColumn(\"total_clicks_by_\" + _key,\n",
    "                    F.expr(\"SUM(clicked) OVER (PARTITION BY {} {})\".format(_key, _PRIOR_ROWS)))\n",
    "        .withColumn(\"total_events_by_\" + _key,\n",
    "                    F.expr(\"COUNT(*) OVER (PARTITION BY {} {})\".format(_key, _PRIOR_ROWS)))\n",
    "    )\n",
    "\n",
    "# Keep the identifiers plus one clicks/events ratio per key.\n",
    "avg_ctrs_by_time = avg_ctrs_by_time.selectExpr(\n",
    "    \"display_id\",\n",
    "    \"timestamp\",\n",
    "    *[\"(total_clicks_by_{0}/total_events_by_{0}) avg_ctr_by_{0}\".format(_k)\n",
    "      for _k in [\"advertiser_id\", \"campaign_id\", \"ad_id\", \"publisher_id\"]]\n",
    ")\n",
    "\n",
    "%show avg_ctrs_by_time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Content-Based Similarities\n",
    "\n",
    "| Column  |  Description |\n",
    "|---|---|\n",
    "|user_doc_ad_sim_categories, user_doc_ad_sim_topics, user_doc_ad_sim_entities | Cosine similarity between user profile and ad document profile vectors (TF-IDF).\n",
    "|doc_event_doc_ad_sim_categories, doc_event_doc_ad_sim_topics, doc_event_doc_ad_sim_entities | Cosine similarity between event document (landing page context) and ad document profile vectors (TF-IDF)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare training and test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- source_id: integer (nullable = true)\n",
      " |-- publisher_id: integer (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the column names and types of docs.\n",
    "docs.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- source_id: integer (nullable = true)\n",
      " |-- publisher_id: integer (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics_vector: vector (nullable = true)\n",
      " |-- categories_vector: vector (nullable = true)\n",
      " |-- topics_idf: vector (nullable = true)\n",
      " |-- categories_idf: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the schema of docs_idf with the document_id column dropped.\n",
    "docs_idf.drop(\"document_id\").printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- uuid: string (nullable = true)\n",
      " |-- ad_id: integer (nullable = true)\n",
      " |-- display_id: integer (nullable = true)\n",
      " |-- clicked: integer (nullable = true)\n",
      " |-- is_train: integer (nullable = false)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- platform: string (nullable = true)\n",
      " |-- geo_location: string (nullable = true)\n",
      " |-- source_id: string (nullable = true)\n",
      " |-- publisher_id: string (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics_vector: vector (nullable = true)\n",
      " |-- categories_vector: vector (nullable = true)\n",
      " |-- topics_idf: vector (nullable = true)\n",
      " |-- categories_idf: vector (nullable = true)\n",
      " |-- campaign_id: integer (nullable = true)\n",
      " |-- advertiser_id: integer (nullable = true)\n",
      " |-- page_view_count_by_uuid: double (nullable = true)\n",
      " |-- page_view_count_by_document_id: double (nullable = true)\n",
      " |-- clicks_by_ad_id: long (nullable = true)\n",
      " |-- events_by_ad_id: long (nullable = false)\n",
      " |-- avg_ctr_by_ad_id: double (nullable = true)\n",
      " |-- clicks_by_campaign_id: long (nullable = true)\n",
      " |-- events_by_campaign_id: long (nullable = false)\n",
      " |-- avg_ctr_by_campaign_id: double (nullable = true)\n",
      " |-- clicks_by_document_id: long (nullable = true)\n",
      " |-- events_by_document_id: long (nullable = false)\n",
      " |-- avg_ctr_by_document_id: double (nullable = true)\n",
      " |-- clicks_by_advertiser_id: long (nullable = true)\n",
      " |-- events_by_advertiser_id: long (nullable = false)\n",
      " |-- avg_ctr_by_advertiser_id: double (nullable = true)\n",
      " |-- country: string (nullable = true)\n",
      " |-- state: string (nullable = true)\n",
      " |-- doc_age: integer (nullable = true)\n",
      " |-- session: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Stack the labelled training clicks and the test rows into one frame.\n",
    "# Test rows get a placeholder clicked = 0 and are flagged with is_train = 0.\n",
    "clicks = (clicks_train.withColumn(\"is_train\", F.lit(1))\n",
    "            .union(clicks_test\n",
    "                   .withColumn(\"clicked\", F.lit(0))\n",
    "                   .withColumn(\"is_train\", F.lit(0))))\n",
    "\n",
    "# Attach event, document, ad and page-view attributes to every click row, then\n",
    "# derive per-key click counts, row counts and their ratio, plus context columns.\n",
    "# NOTE(review): count(*) also counts the test rows (whose clicked is the 0\n",
    "# placeholder), so each ratio is clicks over train + test rows -- confirm intended.\n",
    "df = (clicks\n",
    "            .join(events.alias(\"events\"), on = [\"display_id\"], how = \"left\")\n",
    "            .join(docs_idf.alias(\"docs_idf\"), on = [\"document_id\"], how = \"left\")\n",
    "            .join(promoted_contents.drop(\"document_id\"), on = [\"ad_id\"], how = \"left\")\n",
    "            .join(page_view_count_by_uuid, on = [\"uuid\"], how = \"left\")\n",
    "            .join(page_view_count_by_document_id, on = [\"document_id\"], how = \"left\")\n",
    "\n",
    "            .withColumn(\"clicks_by_ad_id\", F.expr(\"sum(clicked) over (partition by ad_id)\"))\n",
    "            .withColumn(\"events_by_ad_id\", F.expr(\"count(*) over (partition by ad_id)\"))\n",
    "            .withColumn(\"avg_ctr_by_ad_id\", F.expr(\"clicks_by_ad_id/events_by_ad_id\"))\n",
    "\n",
    "            .withColumn(\"clicks_by_campaign_id\", F.expr(\"sum(clicked) over (partition by campaign_id)\"))\n",
    "            .withColumn(\"events_by_campaign_id\", F.expr(\"count(*) over (partition by campaign_id)\"))\n",
    "            .withColumn(\"avg_ctr_by_campaign_id\", F.expr(\"clicks_by_campaign_id/events_by_campaign_id\"))\n",
    "\n",
    "            .withColumn(\"clicks_by_document_id\", F.expr(\"sum(clicked) over (partition by events.document_id)\"))\n",
    "            .withColumn(\"events_by_document_id\", F.expr(\"count(*) over (partition by events.document_id)\"))\n",
    "            # The numerator must be the per-document click count; dividing the campaign\n",
    "            # click count by the document event count gave ratios far above 1.\n",
    "            .withColumn(\"avg_ctr_by_document_id\", F.expr(\"clicks_by_document_id/events_by_document_id\"))\n",
    "\n",
    "            .withColumn(\"clicks_by_advertiser_id\", F.expr(\"sum(clicked) over (partition by advertiser_id)\"))\n",
    "            .withColumn(\"events_by_advertiser_id\", F.expr(\"count(*) over (partition by advertiser_id)\"))\n",
    "            # Likewise, use the per-advertiser click count rather than the campaign one.\n",
    "            .withColumn(\"avg_ctr_by_advertiser_id\", F.expr(\"clicks_by_advertiser_id/events_by_advertiser_id\"))\n",
    "\n",
    "            # geo_location is '>'-separated; the first two parts become country and state.\n",
    "            .withColumn(\"country\", F.expr(\"split(geo_location, '>')[0]\"))\n",
    "            .withColumn(\"state\", F.expr(\"split(geo_location, '>')[1]\"))\n",
    "            # Days between the document's publish_time and the event timestamp.\n",
    "            .withColumn(\"doc_age\", F.expr(\"datediff(timestamp, publish_time)\"))\n",
    "            # NOTE(review): hour % 8 puts hours that are 8 apart in the same bucket --\n",
    "            # confirm this is the intended session encoding.\n",
    "            .withColumn(\"session\", F.expr(\"cast((hour(timestamp) % 8) as string)\"))\n",
    "            # Replace the integer ids with their string form.\n",
    "            .withColumn(\"source_id\", F.expr(\"cast(source_id as string)\"))\n",
    "            .withColumn(\"publisher_id\", F.expr(\"cast(publisher_id as string)\"))\n",
    "           )\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: integer (nullable = true)\n",
      " |-- uuid: string (nullable = true)\n",
      " |-- ad_id: integer (nullable = true)\n",
      " |-- display_id: integer (nullable = true)\n",
      " |-- clicked: integer (nullable = true)\n",
      " |-- is_train: integer (nullable = false)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- platform: string (nullable = true)\n",
      " |-- geo_location: string (nullable = true)\n",
      " |-- source_id: string (nullable = true)\n",
      " |-- publisher_id: string (nullable = true)\n",
      " |-- publish_time: timestamp (nullable = true)\n",
      " |-- categories: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: integer (nullable = true)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- entities: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- id: double (nullable = false)\n",
      " |    |    |-- confidence_level: double (nullable = true)\n",
      " |-- topics_vector: vector (nullable = true)\n",
      " |-- categories_vector: vector (nullable = true)\n",
      " |-- topics_idf: vector (nullable = true)\n",
      " |-- categories_idf: vector (nullable = true)\n",
      " |-- campaign_id: integer (nullable = true)\n",
      " |-- advertiser_id: integer (nullable = true)\n",
      " |-- page_view_count_by_uuid: double (nullable = true)\n",
      " |-- page_view_count_by_document_id: double (nullable = true)\n",
      " |-- clicks_by_ad_id: long (nullable = true)\n",
      " |-- events_by_ad_id: long (nullable = false)\n",
      " |-- avg_ctr_by_ad_id: double (nullable = true)\n",
      " |-- clicks_by_campaign_id: long (nullable = true)\n",
      " |-- events_by_campaign_id: long (nullable = false)\n",
      " |-- avg_ctr_by_campaign_id: double (nullable = true)\n",
      " |-- clicks_by_document_id: long (nullable = true)\n",
      " |-- events_by_document_id: long (nullable = false)\n",
      " |-- avg_ctr_by_document_id: double (nullable = true)\n",
      " |-- clicks_by_advertiser_id: long (nullable = true)\n",
      " |-- events_by_advertiser_id: long (nullable = false)\n",
      " |-- avg_ctr_by_advertiser_id: double (nullable = true)\n",
      " |-- country: string (nullable = true)\n",
      " |-- state: string (nullable = true)\n",
      " |-- doc_age: integer (nullable = true)\n",
      " |-- session: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Persist the enriched frame under base_path, replacing any existing copy,\n",
    "# then print its schema.\n",
    "(df.write\n",
    "   .save(base_path + \"merged_enriched\", mode=\"overwrite\"))\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Machine Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of partitions for df merged_enriched: 96\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "119366893"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebind df to the result of load(\"merged_enriched\") with cache = False, then\n",
    "# count its rows. load is defined elsewhere in the notebook; presumably it reads\n",
    "# the dataset saved under base_path -- TODO confirm.\n",
    "df = load(\"merged_enriched\", cache = False)\n",
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = [\n",
    "   'platform'\n",
    " , 'source_id'\n",
    " , 'publisher_id'\n",
    " , 'topics_idf'\n",
    " , 'categories_idf'\n",
    " , 'avg_ctr_by_ad_id'\n",
    " , 'avg_ctr_by_campaign_id'\n",
    " , 'avg_ctr_by_document_id'\n",
    " , 'avg_ctr_by_advertiser_id'\n",
    " , \"country\"\n",
    " , \"state\"\n",
    " , \"doc_age\"\n",
    " , \"session\"\n",
    " , \"ad_id\"\n",
    " , \"display_id\"\n",
    " , \"is_train\"\n",
    " , \"clicked\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- platform: string (nullable = true)\n",
      " |-- source_id: string (nullable = true)\n",
      " |-- publisher_id: string (nullable = true)\n",
      " |-- topics_idf: vector (nullable = true)\n",
      " |-- categories_idf: vector (nullable = true)\n",
      " |-- avg_ctr_by_ad_id: double (nullable = true)\n",
      " |-- avg_ctr_by_campaign_id: double (nullable = true)\n",
      " |-- avg_ctr_by_document_id: double (nullable = true)\n",
      " |-- avg_ctr_by_advertiser_id: double (nullable = true)\n",
      " |-- country: string (nullable = true)\n",
      " |-- state: string (nullable = true)\n",
      " |-- doc_age: integer (nullable = true)\n",
      " |-- session: string (nullable = true)\n",
      " |-- ad_id: integer (nullable = true)\n",
      " |-- display_id: integer (nullable = true)\n",
      " |-- is_train: integer (nullable = true)\n",
      " |-- clicked: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Project df onto the columns listed in features and print the resulting schema.\n",
    "df.selectExpr(*features).printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>document_id</th>\n",
       "      <th>uuid</th>\n",
       "      <th>ad_id</th>\n",
       "      <th>display_id</th>\n",
       "      <th>clicked</th>\n",
       "      <th>is_train</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>platform</th>\n",
       "      <th>geo_location</th>\n",
       "      <th>source_id</th>\n",
       "      <th>publisher_id</th>\n",
       "      <th>publish_time</th>\n",
       "      <th>categories</th>\n",
       "      <th>topics</th>\n",
       "      <th>entities</th>\n",
       "      <th>topics_vector</th>\n",
       "      <th>categories_vector</th>\n",
       "      <th>topics_idf</th>\n",
       "      <th>categories_idf</th>\n",
       "      <th>campaign_id</th>\n",
       "      <th>advertiser_id</th>\n",
       "      <th>page_view_count_by_uuid</th>\n",
       "      <th>page_view_count_by_document_id</th>\n",
       "      <th>clicks_by_ad_id</th>\n",
       "      <th>events_by_ad_id</th>\n",
       "      <th>avg_ctr_by_ad_id</th>\n",
       "      <th>clicks_by_campaign_id</th>\n",
       "      <th>events_by_campaign_id</th>\n",
       "      <th>avg_ctr_by_campaign_id</th>\n",
       "      <th>clicks_by_document_id</th>\n",
       "      <th>events_by_document_id</th>\n",
       "      <th>avg_ctr_by_document_id</th>\n",
       "      <th>clicks_by_advertiser_id</th>\n",
       "      <th>events_by_advertiser_id</th>\n",
       "      <th>avg_ctr_by_advertiser_id</th>\n",
       "      <th>country</th>\n",
       "      <th>state</th>\n",
       "      <th>doc_age</th>\n",
       "      <th>session</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5540</td>\n",
       "      <td>bf67d31751a2b4</td>\n",
       "      <td>106789</td>\n",
       "      <td>1993509</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2016-06-15 21:43:52</td>\n",
       "      <td>1</td>\n",
       "      <td>US&gt;UT&gt;770</td>\n",
       "      <td>121</td>\n",
       "      <td>655</td>\n",
       "      <td>2010-05-19 04:00:00</td>\n",
       "      <td>[(23.0, 0.069408514), (5.0, 0.912226184)]</td>\n",
       "      <td>[(143, 0.0677015377172056), (92, 0.01556335560...</td>\n",
       "      <td>None</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.912226184, 0.0, 0....</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 2.415922540132727, 0...</td>\n",
       "      <td>7615</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>18525</td>\n",
       "      <td>82844</td>\n",
       "      <td>0.223613</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>2</td>\n",
       "      <td>16</td>\n",
       "      <td>1742.250000</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>US</td>\n",
       "      <td>UT</td>\n",
       "      <td>2219.0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1683252</td>\n",
       "      <td>7e86be74917d24</td>\n",
       "      <td>106789</td>\n",
       "      <td>21859752</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2016-06-28 17:07:29</td>\n",
       "      <td>1</td>\n",
       "      <td>US&gt;MD&gt;512</td>\n",
       "      <td>7736</td>\n",
       "      <td>450</td>\n",
       "      <td>2016-06-08 19:00:00</td>\n",
       "      <td>[(41.0, 0.92), (1.0, 0.07)]</td>\n",
       "      <td>[(85, 0.0264917522989691), (20, 0.089020019477...</td>\n",
       "      <td>None</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.1395557773478285, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>7615</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>18525</td>\n",
       "      <td>82844</td>\n",
       "      <td>0.223613</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>3617</td>\n",
       "      <td>25551</td>\n",
       "      <td>1.090994</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>US</td>\n",
       "      <td>MD</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13319</td>\n",
       "      <td>55cbcc88990fd2</td>\n",
       "      <td>56860</td>\n",
       "      <td>10356183</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2016-06-22 09:10:14</td>\n",
       "      <td>1</td>\n",
       "      <td>US&gt;GA&gt;524</td>\n",
       "      <td>31</td>\n",
       "      <td>535</td>\n",
       "      <td>NaT</td>\n",
       "      <td>None</td>\n",
       "      <td>[(271, 0.0346064878342803)]</td>\n",
       "      <td>[(131.0, 0.559219447002035)]</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>7615</td>\n",
       "      <td>12</td>\n",
       "      <td>0.0</td>\n",
       "      <td>None</td>\n",
       "      <td>1294</td>\n",
       "      <td>6777</td>\n",
       "      <td>0.190940</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "      <td>284.448980</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>US</td>\n",
       "      <td>GA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1683252</td>\n",
       "      <td>f88f087861ca26</td>\n",
       "      <td>106789</td>\n",
       "      <td>22094475</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2016-06-28 20:05:54</td>\n",
       "      <td>1</td>\n",
       "      <td>US&gt;MI&gt;505</td>\n",
       "      <td>7736</td>\n",
       "      <td>450</td>\n",
       "      <td>2016-06-08 19:00:00</td>\n",
       "      <td>[(41.0, 0.92), (1.0, 0.07)]</td>\n",
       "      <td>[(85, 0.0264917522989691), (20, 0.089020019477...</td>\n",
       "      <td>None</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.1395557773478285, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>7615</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>18525</td>\n",
       "      <td>82844</td>\n",
       "      <td>0.223613</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>3617</td>\n",
       "      <td>25551</td>\n",
       "      <td>1.090994</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>US</td>\n",
       "      <td>MI</td>\n",
       "      <td>20.0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>30671</td>\n",
       "      <td>1661a9463dfa61</td>\n",
       "      <td>106789</td>\n",
       "      <td>909524</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2016-06-15 01:25:40</td>\n",
       "      <td>1</td>\n",
       "      <td>US&gt;NY&gt;514</td>\n",
       "      <td>2609</td>\n",
       "      <td>3</td>\n",
       "      <td>2007-11-15 00:00:00</td>\n",
       "      <td>[(5.0, 0.92), (23.0, 0.07)]</td>\n",
       "      <td>[(226, 0.0448206753564123), (56, 0.00942625642...</td>\n",
       "      <td>[(318517.0, 0.237299772509975), (532303.0, 0.2...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.92, 0.0, 0.0, 0.0,...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
       "      <td>(0.0, 0.0, 0.0, 0.0, 0.0, 2.436510567122802, 0...</td>\n",
       "      <td>7615</td>\n",
       "      <td>12</td>\n",
       "      <td>0.0</td>\n",
       "      <td>None</td>\n",
       "      <td>18525</td>\n",
       "      <td>82844</td>\n",
       "      <td>0.223613</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>10</td>\n",
       "      <td>63</td>\n",
       "      <td>442.476190</td>\n",
       "      <td>27876</td>\n",
       "      <td>147388</td>\n",
       "      <td>0.189133</td>\n",
       "      <td>US</td>\n",
       "      <td>NY</td>\n",
       "      <td>3135.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   document_id            uuid   ad_id  display_id  clicked  is_train  \\\n",
       "0         5540  bf67d31751a2b4  106789     1993509        0         1   \n",
       "1      1683252  7e86be74917d24  106789    21859752        0         0   \n",
       "2        13319  55cbcc88990fd2   56860    10356183        0         1   \n",
       "3      1683252  f88f087861ca26  106789    22094475        0         0   \n",
       "4        30671  1661a9463dfa61  106789      909524        0         1   \n",
       "\n",
       "            timestamp platform geo_location source_id publisher_id  \\\n",
       "0 2016-06-15 21:43:52        1    US>UT>770       121          655   \n",
       "1 2016-06-28 17:07:29        1    US>MD>512      7736          450   \n",
       "2 2016-06-22 09:10:14        1    US>GA>524        31          535   \n",
       "3 2016-06-28 20:05:54        1    US>MI>505      7736          450   \n",
       "4 2016-06-15 01:25:40        1    US>NY>514      2609            3   \n",
       "\n",
       "         publish_time                                 categories  \\\n",
       "0 2010-05-19 04:00:00  [(23.0, 0.069408514), (5.0, 0.912226184)]   \n",
       "1 2016-06-08 19:00:00                [(41.0, 0.92), (1.0, 0.07)]   \n",
       "2                 NaT                                       None   \n",
       "3 2016-06-08 19:00:00                [(41.0, 0.92), (1.0, 0.07)]   \n",
       "4 2007-11-15 00:00:00                [(5.0, 0.92), (23.0, 0.07)]   \n",
       "\n",
       "                                              topics  \\\n",
       "0  [(143, 0.0677015377172056), (92, 0.01556335560...   \n",
       "1  [(85, 0.0264917522989691), (20, 0.089020019477...   \n",
       "2                        [(271, 0.0346064878342803)]   \n",
       "3  [(85, 0.0264917522989691), (20, 0.089020019477...   \n",
       "4  [(226, 0.0448206753564123), (56, 0.00942625642...   \n",
       "\n",
       "                                            entities  \\\n",
       "0                                               None   \n",
       "1                                               None   \n",
       "2                       [(131.0, 0.559219447002035)]   \n",
       "3                                               None   \n",
       "4  [(318517.0, 0.237299772509975), (532303.0, 0.2...   \n",
       "\n",
       "                                       topics_vector  \\\n",
       "0  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "1  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "2  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "3  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "4  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "\n",
       "                                   categories_vector  \\\n",
       "0  (0.0, 0.0, 0.0, 0.0, 0.0, 0.912226184, 0.0, 0....   \n",
       "1  (0.0, 0.07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
       "2  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "3  (0.0, 0.07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
       "4  (0.0, 0.0, 0.0, 0.0, 0.0, 0.92, 0.0, 0.0, 0.0,...   \n",
       "\n",
       "                                          topics_idf  \\\n",
       "0  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "1  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "2  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "3  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "4  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
       "\n",
       "                                      categories_idf  campaign_id  \\\n",
       "0  (0.0, 0.0, 0.0, 0.0, 0.0, 2.415922540132727, 0...         7615   \n",
       "1  (0.0, 0.1395557773478285, 0.0, 0.0, 0.0, 0.0, ...         7615   \n",
       "2  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...         7615   \n",
       "3  (0.0, 0.1395557773478285, 0.0, 0.0, 0.0, 0.0, ...         7615   \n",
       "4  (0.0, 0.0, 0.0, 0.0, 0.0, 2.436510567122802, 0...         7615   \n",
       "\n",
       "   advertiser_id  page_view_count_by_uuid page_view_count_by_document_id  \\\n",
       "0             12                      NaN                           None   \n",
       "1             12                      NaN                           None   \n",
       "2             12                      0.0                           None   \n",
       "3             12                      NaN                           None   \n",
       "4             12                      0.0                           None   \n",
       "\n",
       "   clicks_by_ad_id  events_by_ad_id  avg_ctr_by_ad_id  clicks_by_campaign_id  \\\n",
       "0            18525            82844          0.223613                  27876   \n",
       "1            18525            82844          0.223613                  27876   \n",
       "2             1294             6777          0.190940                  27876   \n",
       "3            18525            82844          0.223613                  27876   \n",
       "4            18525            82844          0.223613                  27876   \n",
       "\n",
       "   events_by_campaign_id  avg_ctr_by_campaign_id  clicks_by_document_id  \\\n",
       "0                 147388                0.189133                      2   \n",
       "1                 147388                0.189133                   3617   \n",
       "2                 147388                0.189133                     22   \n",
       "3                 147388                0.189133                   3617   \n",
       "4                 147388                0.189133                     10   \n",
       "\n",
       "   events_by_document_id  avg_ctr_by_document_id  clicks_by_advertiser_id  \\\n",
       "0                     16             1742.250000                    27876   \n",
       "1                  25551                1.090994                    27876   \n",
       "2                     98              284.448980                    27876   \n",
       "3                  25551                1.090994                    27876   \n",
       "4                     63              442.476190                    27876   \n",
       "\n",
       "   events_by_advertiser_id  avg_ctr_by_advertiser_id country state  doc_age  \\\n",
       "0                   147388                  0.189133      US    UT   2219.0   \n",
       "1                   147388                  0.189133      US    MD     20.0   \n",
       "2                   147388                  0.189133      US    GA      NaN   \n",
       "3                   147388                  0.189133      US    MI     20.0   \n",
       "4                   147388                  0.189133      US    NY   3135.0   \n",
       "\n",
       "  session  \n",
       "0       5  \n",
       "1       1  \n",
       "2       1  \n",
       "3       4  \n",
       "4       1  "
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%show df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain alias: df1 is bound to the same DataFrame object as df (no copy is made).\n",
    "df1 = df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Detect and impute null values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                             Count       pct   dtype\n",
      "platform                         0  0.000000  string\n",
      "source_id                     6179  0.000052  string\n",
      "publisher_id                  6185  0.000052  string\n",
      "topics_idf                       0  0.000000  vector\n",
      "categories_idf                   0  0.000000  vector\n",
      "avg_ctr_by_ad_id                 0  0.000000  double\n",
      "avg_ctr_by_campaign_id           0  0.000000  double\n",
      "avg_ctr_by_document_id           0  0.000000  double\n",
      "avg_ctr_by_advertiser_id         0  0.000000  double\n",
      "country                       1769  0.000015  string\n",
      "state                      5876820  0.049233  string\n",
      "doc_age                   16600181  0.139069     int\n",
      "session                          0  0.000000  string\n",
      "ad_id                            0  0.000000     int\n",
      "display_id                       0  0.000000     int\n",
      "is_train                         0  0.000000     int\n",
      "clicked                          0  0.000000     int\n"
     ]
    }
   ],
   "source": [
    "def show_null_counts(df):\n",
    "    \"\"\"Print, for every column of ``df``, its null count, null fraction and Spark dtype.\n",
    "\n",
    "    The row total is computed in the same aggregation as the per-column null\n",
    "    counts, so the DataFrame is scanned once instead of twice.\n",
    "\n",
    "    Args:\n",
    "        df: Spark DataFrame to inspect.\n",
    "\n",
    "    Returns:\n",
    "        None. The summary table is written to stdout.\n",
    "    \"\"\"\n",
    "    total_alias = \"__row_count__\"\n",
    "    # Back-tick quoting keeps column names containing dots, spaces, etc. valid in SQL.\n",
    "    null_testers = [\"sum(if(isnull(`%s`), 1, 0)) `%s`\" % (f, f) for f in df.columns]\n",
    "    stats = df.selectExpr(\"count(1) `%s`\" % total_alias, *null_testers).toPandas()\n",
    "    # Remove the helper total from the frame so only the real columns are reported.\n",
    "    total_rows = int(stats.pop(total_alias).iloc[0])\n",
    "    null_counts = stats.T\n",
    "    null_counts.columns = [\"Count\"]\n",
    "    null_counts[\"pct\"] = null_counts.Count / total_rows\n",
    "    null_counts[\"dtype\"] = [t[1] for t in df.dtypes]\n",
    "    print(null_counts.to_string())\n",
    "\n",
    "show_null_counts(df.selectExpr(*features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                        0\n",
      "approx_count_distinct(platform)                         4\n",
      "approx_count_distinct(source_id)                     4801\n",
      "approx_count_distinct(publisher_id)                   481\n",
      "approx_count_distinct(topics_idf)                  749231\n",
      "approx_count_distinct(categories_idf)              315105\n",
      "approx_count_distinct(avg_ctr_by_ad_id)             28154\n",
      "approx_count_distinct(avg_ctr_by_campaign_id)       16182\n",
      "approx_count_distinct(avg_ctr_by_document_id)     6658183\n",
      "approx_count_distinct(avg_ctr_by_advertiser_id)     18926\n",
      "approx_count_distinct(country)                        222\n",
      "approx_count_distinct(state)                          408\n",
      "approx_count_distinct(doc_age)                       7441\n",
      "approx_count_distinct(session)                          8\n",
      "approx_count_distinct(ad_id)                       535318\n",
      "approx_count_distinct(display_id)                23137934\n",
      "approx_count_distinct(is_train)                         2\n",
      "approx_count_distinct(clicked)                          2\n"
     ]
    }
   ],
   "source": [
    "# Project the joined frame onto the expressions listed in `features`.\n",
    "df_trunc = df.selectExpr(*features)\n",
    "\n",
    "# One approx_count_distinct aggregate per projected column.\n",
    "distinct_exprs = [\"approx_count_distinct(%s)\" % name for name in df_trunc.columns]\n",
    "distinct_counts = df_trunc.selectExpr(*distinct_exprs).toPandas()\n",
    "\n",
    "# Transpose so each aggregate is printed on its own row.\n",
    "print(distinct_counts.T.to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                          Count  pct   dtype\n",
      "platform                      0  0.0  string\n",
      "source_id                     0  0.0  string\n",
      "publisher_id                  0  0.0  string\n",
      "topics_idf                    0  0.0  vector\n",
      "categories_idf                0  0.0  vector\n",
      "avg_ctr_by_ad_id              0  0.0  double\n",
      "avg_ctr_by_campaign_id        0  0.0  double\n",
      "avg_ctr_by_document_id        0  0.0  double\n",
      "avg_ctr_by_advertiser_id      0  0.0  double\n",
      "country                       0  0.0  string\n",
      "state                         0  0.0  string\n",
      "doc_age                       0  0.0     int\n",
      "session                       0  0.0  string\n",
      "ad_id                         0  0.0     int\n",
      "display_id                    0  0.0     int\n",
      "is_train                      0  0.0     int\n",
      "clicked                       0  0.0     int\n"
     ]
    }
   ],
   "source": [
    "# Replacement value for nulls, keyed by column name.\n",
    "fill_na_values = {\n",
    "    # string-typed columns get a textual placeholder\n",
    "    \"platform\": \"<null>\",\n",
    "    \"source_id\": \"<null>\",\n",
    "    \"publisher_id\": \"<null>\",\n",
    "    \"country\": \"null\",\n",
    "    \"state\": \"null\",\n",
    "    # double-typed CTR columns default to zero\n",
    "    \"avg_ctr_by_ad_id\": 0.0,\n",
    "    \"avg_ctr_by_campaign_id\": 0.0,\n",
    "    \"avg_ctr_by_document_id\": 0.0,\n",
    "    \"avg_ctr_by_advertiser_id\": 0.0,\n",
    "    # integer column: -1 marks a missing document age\n",
    "    \"doc_age\": -1,\n",
    "}\n",
    "\n",
    "df_null_removed = df.selectExpr(*features).na.fill(fill_na_values)\n",
    "\n",
    "# Re-run the null report to confirm the listed columns no longer contain nulls.\n",
    "show_null_counts(df_null_removed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply StringIndexer and one-hot encoding\n",
    "\n",
    "Convert categorical (string-typed) columns into index values, and then into one-hot encoded vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every string-typed column is treated as categorical.\n",
    "categorical_columns = [name for name, kind in df_null_removed.dtypes if kind == \"string\"]\n",
    "index_columns = [\"%s_index\" % name for name in categorical_columns]\n",
    "vector_columns = [\"%s_vec\" % name for name in categorical_columns]\n",
    "\n",
    "# Fit one StringIndexer per categorical column; each pass appends a <column>_index column.\n",
    "df_string_indexed = df_null_removed\n",
    "for source_name, index_name in zip(categorical_columns, index_columns):\n",
    "    indexer = StringIndexer(inputCol=source_name, outputCol=index_name)\n",
    "    index_model = indexer.fit(df_string_indexed)\n",
    "    df_string_indexed = index_model.transform(df_string_indexed)\n",
    "\n",
    "# A single estimator encodes all index columns, writing one <column>_vec column each.\n",
    "one_hot_estimator = OneHotEncoderEstimator(inputCols=index_columns, outputCols=vector_columns)\n",
    "one_hot_model = one_hot_estimator.fit(df_string_indexed)\n",
    "df_ohe = one_hot_model.transform(df_string_indexed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('platform', 'string'),\n",
       " ('source_id', 'string'),\n",
       " ('publisher_id', 'string'),\n",
       " ('topics_idf', 'vector'),\n",
       " ('categories_idf', 'vector'),\n",
       " ('avg_ctr_by_ad_id', 'double'),\n",
       " ('avg_ctr_by_campaign_id', 'double'),\n",
       " ('avg_ctr_by_document_id', 'double'),\n",
       " ('avg_ctr_by_advertiser_id', 'double'),\n",
       " ('country', 'string'),\n",
       " ('state', 'string'),\n",
       " ('doc_age', 'int'),\n",
       " ('session', 'string'),\n",
       " ('ad_id', 'int'),\n",
       " ('display_id', 'int'),\n",
       " ('is_train', 'int'),\n",
       " ('clicked', 'int'),\n",
       " ('platform_index', 'double'),\n",
       " ('source_id_index', 'double'),\n",
       " ('publisher_id_index', 'double'),\n",
       " ('country_index', 'double'),\n",
       " ('state_index', 'double'),\n",
       " ('session_index', 'double'),\n",
       " ('country_vec', 'vector'),\n",
       " ('session_vec', 'vector'),\n",
       " ('source_id_vec', 'vector'),\n",
       " ('state_vec', 'vector'),\n",
       " ('publisher_id_vec', 'vector'),\n",
       " ('platform_vec', 'vector')]"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ohe.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (column, dtype) pairs to pack into the feature vector, in assembly order.\n",
    "to_be_vectorized = [\n",
    "    (\"topics_idf\", \"vector\"),\n",
    "    (\"categories_idf\", \"vector\"),\n",
    "    (\"avg_ctr_by_ad_id\", \"double\"),\n",
    "    (\"avg_ctr_by_campaign_id\", \"double\"),\n",
    "    (\"avg_ctr_by_document_id\", \"double\"),\n",
    "    (\"avg_ctr_by_advertiser_id\", \"double\"),\n",
    "    (\"doc_age\", \"int\"),\n",
    "    (\"country_vec\", \"vector\"),\n",
    "    (\"session_vec\", \"vector\"),\n",
    "    (\"source_id_vec\", \"vector\"),\n",
    "    (\"state_vec\", \"vector\"),\n",
    "    (\"publisher_id_vec\", \"vector\"),\n",
    "    (\"platform_vec\", \"vector\"),\n",
    "]\n",
    "\n",
    "# Only the column names are handed to the assembler; the dtype half is informational.\n",
    "assembler_inputs = [column for column, _dtype in to_be_vectorized]\n",
    "vector_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol=\"features\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('platform', 'string'),\n",
       " ('source_id', 'string'),\n",
       " ('publisher_id', 'string'),\n",
       " ('topics_idf', 'vector'),\n",
       " ('categories_idf', 'vector'),\n",
       " ('avg_ctr_by_ad_id', 'double'),\n",
       " ('avg_ctr_by_campaign_id', 'double'),\n",
       " ('avg_ctr_by_document_id', 'double'),\n",
       " ('avg_ctr_by_advertiser_id', 'double'),\n",
       " ('country', 'string'),\n",
       " ('state', 'string'),\n",
       " ('doc_age', 'int'),\n",
       " ('session', 'string'),\n",
       " ('ad_id', 'int'),\n",
       " ('display_id', 'int'),\n",
       " ('is_train', 'int'),\n",
       " ('clicked', 'int'),\n",
       " ('platform_index', 'double'),\n",
       " ('source_id_index', 'double'),\n",
       " ('publisher_id_index', 'double'),\n",
       " ('country_index', 'double'),\n",
       " ('state_index', 'double'),\n",
       " ('session_index', 'double'),\n",
       " ('country_vec', 'vector'),\n",
       " ('session_vec', 'vector'),\n",
       " ('source_id_vec', 'vector'),\n",
       " ('state_vec', 'vector'),\n",
       " ('publisher_id_vec', 'vector'),\n",
       " ('platform_vec', 'vector'),\n",
       " ('features', 'vector')]"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Apply the assembler to the one-hot encoded frame; its output column is named \"features\".\n",
    "df_vectorized = vector_assembler.transform(df_ohe)\n",
    "# Display the resulting (column, dtype) pairs.\n",
    "df_vectorized.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- display_id: integer (nullable = true)\n",
      " |-- ad_id: integer (nullable = true)\n",
      " |-- clicked: integer (nullable = true)\n",
      " |-- features: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Keep rows flagged is_train = 1 and the modelling columns, then split 70/30 with a fixed seed.\n",
    "df_train, df_test = (\n",
    "    df_vectorized\n",
    "    .filter(\"is_train = 1\")\n",
    "    .select(\"display_id\", \"ad_id\", \"clicked\", \"features\")\n",
    "    .randomSplit(weights=[0.7, 0.3], seed=1)\n",
    ")\n",
    "\n",
    "# cache_df is a helper defined elsewhere in this notebook.\n",
    "cache_df(df_train, \"df_train\")\n",
    "df_train.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60993108"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Coefficients: (6487,[397,398],[2.01186440125852,0.5053783000313586])\n",
      "Intercept: -1.8179425814912373\n",
      "objectiveHistory:\n",
      "0.49149536243626407\n",
      "0.49071853085585954\n",
      "0.4900603897242247\n",
      "0.4896705177876703\n",
      "0.4894437534510252\n",
      "0.4892235342895401\n",
      "0.48764606194710963\n",
      "0.4862003489762839\n",
      "0.4857844848265789\n",
      "0.4856612186190992\n",
      "0.4856568026282702\n",
      "+--------------------+--------------------+\n",
      "|                 FPR|                 TPR|\n",
      "+--------------------+--------------------+\n",
      "|                 0.0|                 0.0|\n",
      "|                 0.0|6.036929653642272E-4|\n",
      "|2.155515447772598...|0.002933383992409266|\n",
      "|0.003406796130165561|0.024339891245529235|\n",
      "|0.006615059548020476|  0.0423301278599859|\n",
      "|0.007392984890123408|0.046380906811004395|\n",
      "|0.011738378780732616| 0.06657816566736899|\n",
      "| 0.01268628911616994| 0.07072520024896091|\n",
      "|0.018890151712013213| 0.09525413179622319|\n",
      "|0.019317920167727202| 0.09679710024198514|\n",
      "| 0.02547930684837005|  0.1194485813765588|\n",
      "| 0.03375214707719751| 0.14759594578395258|\n",
      "| 0.04134797530386444| 0.17088464427237515|\n",
      "| 0.04158491730474072| 0.17156673012596366|\n",
      "| 0.04967203469515713| 0.19500239411542006|\n",
      "| 0.05529973762938278| 0.20996028206540032|\n",
      "| 0.05535374243109284| 0.21012214729464956|\n",
      "| 0.06314121043792445|  0.2298204345708912|\n",
      "| 0.06322376295861076|  0.2300147236405183|\n",
      "| 0.07142696220631738|  0.2504988022650296|\n",
      "+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n",
      "areaUnderROC: 0.7220436034660719\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.classification import LogisticRegression\n",
    "\n",
    "# Logistic regression on the assembled feature vector, with `clicked` as the label.\n",
    "lr = LogisticRegression(\n",
    "    featuresCol=\"features\",\n",
    "    labelCol=\"clicked\",\n",
    "    maxIter=10,\n",
    "    regParam=0.1,\n",
    "    elasticNetParam=0.8,\n",
    ")\n",
    "lrModel = lr.fit(df_train)\n",
    "\n",
    "print(\"Coefficients: \", lrModel.coefficients, sep=\"\")\n",
    "print(\"Intercept: \", lrModel.intercept, sep=\"\")\n",
    "\n",
    "# Training summary: print the objective history, one value per line.\n",
    "trainingSummary = lrModel.summary\n",
    "objectiveHistory = trainingSummary.objectiveHistory\n",
    "print(\"objectiveHistory:\")\n",
    "for objective_value in objectiveHistory:\n",
    "    print(objective_value)\n",
    "\n",
    "# Show the ROC curve (FPR/TPR rows) and the area under it.\n",
    "trainingSummary.roc.show()\n",
    "print(\"areaUnderROC: \", trainingSummary.areaUnderROC, sep=\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False positive rate by label:\n",
      "label 0: 0.9992980196229421\n",
      "label 1: 3.049969976095555e-07\n",
      "True positive rate by label:\n",
      "label 0: 0.9999996950030023\n",
      "label 1: 0.0007019803770579403\n",
      "Precision by label:\n",
      "label 0: 0.8064435104084376\n",
      "label 1: 0.9981942939689419\n",
      "Recall by label:\n",
      "label 0: 0.9999996950030023\n",
      "label 1: 0.0007019803770579403\n",
      "F-measure by label:\n",
      "label 0: 0.8928520553868292\n",
      "label 1: 0.0014029741122343758\n",
      "Accuracy: 0.8064696260436507\n",
      "FPR: 0.8057679506635904\n",
      "TPR: 0.8064696260436507\n",
      "F-measure: 0.7202086082461527\n",
      "Precision: 0.843579132578218\n",
      "Recall: 0.8064696260436507\n"
     ]
    }
   ],
   "source": [
    "# Per-label metrics from the training summary. Each getter is evaluated only when its\n",
    "# section is printed, so the metrics are read in the same order as they are shown.\n",
    "per_label_sections = [\n",
    "    (\"False positive rate by label:\", lambda summary: summary.falsePositiveRateByLabel),\n",
    "    (\"True positive rate by label:\", lambda summary: summary.truePositiveRateByLabel),\n",
    "    (\"Precision by label:\", lambda summary: summary.precisionByLabel),\n",
    "    (\"Recall by label:\", lambda summary: summary.recallByLabel),\n",
    "    (\"F-measure by label:\", lambda summary: summary.fMeasureByLabel()),\n",
    "]\n",
    "for heading, read_values in per_label_sections:\n",
    "    print(heading)\n",
    "    for label_index, value in enumerate(read_values(trainingSummary)):\n",
    "        print(\"label %d: %s\" % (label_index, value))\n",
    "\n",
    "# Overall accuracy plus the label-weighted rates reported by the summary.\n",
    "accuracy = trainingSummary.accuracy\n",
    "falsePositiveRate = trainingSummary.weightedFalsePositiveRate\n",
    "truePositiveRate = trainingSummary.weightedTruePositiveRate\n",
    "fMeasure = trainingSummary.weightedFMeasure()\n",
    "precision = trainingSummary.weightedPrecision\n",
    "recall = trainingSummary.weightedRecall\n",
    "\n",
    "weighted_values = (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)\n",
    "print(\"Accuracy: %s\\nFPR: %s\\nTPR: %s\\nF-measure: %s\\nPrecision: %s\\nRecall: %s\" % weighted_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the fitted model, overwriting any existing copy at the target path.\n",
    "# The path is built by plain string concatenation, so base_path (defined elsewhere)\n",
    "# must already end with a path separator.\n",
    "lrModel.write().overwrite().save(base_path + \"lrModel\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-------+--------------------+\n",
      "|prediction|clicked|            features|\n",
      "+----------+-------+--------------------+\n",
      "|       0.0|      1|(6487,[20,97,241,...|\n",
      "|       0.0|      0|(6487,[8,56,93,10...|\n",
      "|       0.0|      0|(6487,[183,199,30...|\n",
      "|       0.0|      0|(6487,[71,199,229...|\n",
      "|       0.0|      0|(6487,[16,29,65,1...|\n",
      "+----------+-------+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score the held-out split with the fitted model.\n",
    "predictions = lrModel.transform(df_test)\n",
    "\n",
    "# Preview a few scored rows.\n",
    "predictions.select(\"prediction\", \"clicked\", \"features\").show(5)\n",
    "\n",
    "# Compare predicted and true labels; the metric reported here is accuracy.\n",
    "evaluator = MulticlassClassificationEvaluator(\n",
    "    metricName=\"accuracy\",\n",
    "    labelCol=\"clicked\",\n",
    "    predictionCol=\"prediction\",\n",
    ")\n",
    "accuracy = evaluator.evaluate(predictions)\n",
    "print(\"Accuracy = %g \" % accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
